35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
147 template<
typename MT1
// Compile-time predicate over three types. `value` is true when T1 is a
// column-major matrix type and at least one of T2 / T3 is symmetric.
// It is used below (via EnableIf_/DisableIf_) to choose between overloads.
181 template<
typename T1,
typename T2,
typename T3 >
182 struct CanExploitSymmetry {
183 enum :
bool { value = IsColumnMajorMatrix<T1>::value &&
184 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
// Compile-time predicate over three types. `value` is true when either
// operand flag (evaluateLeft / evaluateRight, defined outside this view) is
// set AND the symmetry shortcut (CanExploitSymmetry<T1,T2,T3>) does not apply.
195 template<
typename T1,
typename T2,
typename T3 >
196 struct IsEvaluationRequired {
197 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
198 !CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time predicate deciding whether the BLAS-based kernel overloads are
// enabled for the type triple (T1 = target, T2/T3 = operands).
// NOTE(review): the opening of the `value` expression (original line 210) is
// not visible in this view; only the conjuncts listed below are confirmed.
// Visible requirements:
//   - T1 offers mutable data access, T2 and T3 offer const data access
//   - neither operand is diagonal
//   - all three types have simdEnabled set
//   - all three element types are BLAS compatible and identical
208 template<
typename T1,
typename T2,
typename T3 >
209 struct UseBlasKernel {
211 HasMutableDataAccess<T1>::value &&
212 HasConstDataAccess<T2>::value &&
213 HasConstDataAccess<T3>::value &&
214 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
215 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
216 IsBLASCompatible< ElementType_<T1> >::value &&
217 IsBLASCompatible< ElementType_<T2> >::value &&
218 IsBLASCompatible< ElementType_<T3> >::value &&
219 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
220 IsSame< ElementType_<T1>, ElementType_<T3> >::value };
// Compile-time predicate deciding whether the hand-vectorized (SIMD) kernel
// overloads are enabled for the type triple (T1 = target, T2/T3 = operands).
// NOTE(review): original lines 232 and 236 are not visible in this view, so
// the first conjunct and the middle AreSIMDCombinable argument are unknown here.
// Visible requirements:
//   - the right-hand operand T3 is not diagonal
//   - all three types have simdEnabled set
//   - the element types are SIMD combinable
//   - SIMD addition and multiplication exist for the T2/T3 element types
230 template<
typename T1,
typename T2,
typename T3 >
231 struct UseVectorizedDefaultKernel {
233 !IsDiagonal<T3>::value &&
234 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
235 AreSIMDCombinable< ElementType_<T1>
237 , ElementType_<T3> >::value &&
238 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
239 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
271 MT1::simdEnabled && MT2::simdEnabled &&
// Compile-time flag: the expression is SMP-assignable only when neither
// operand needs evaluation (evaluateLeft / evaluateRight are both false) and
// both operand types report smpAssignable themselves.
276 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
277 !evaluateRight && MT2::smpAssignable };
332 :(
lhs_.columns() ) ) );
336 const size_t n(
end - begin );
// Element access with bounds checking.
// i is compared against lhs_.rows() and j against rhs_.columns().
// NOTE(review): the bodies of both range checks and the final return are not
// visible in this view; presumably an out-of-range error is raised — confirm.
354 inline ReturnType
at(
size_t i,
size_t j )
const {
// Row index check against the number of rows of the left operand.
355 if( i >=
lhs_.rows() ) {
// Column index check against the number of columns of the right operand.
358 if( j >=
rhs_.columns() ) {
370 inline size_t rows() const noexcept {
381 return rhs_.columns();
// Returns true if either operand reports that it can alias with the given
// address; the query is forwarded to lhs_.canAlias() and rhs_.canAlias().
//   alias: address to be checked.
411 template<
typename T >
412 inline bool canAlias(
const T* alias )
const noexcept {
413 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
// Returns true if either operand reports that it is aliased with the given
// address; the query is forwarded to lhs_.isAliased() and rhs_.isAliased().
//   alias: address to be checked.
423 template<
typename T >
424 inline bool isAliased(
const T* alias )
const noexcept {
425 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
435 return lhs_.isAligned() &&
rhs_.isAligned();
446 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
447 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD );
// Entry point that ends by dispatching to selectAssignKernel( ~lhs, A, B ).
// NOTE(review): the function signature and the construction of A and B are
// not visible in this view; presumably this is the assign() overload for a
// dense target — confirm against the full file.
470 template<
typename MT
// Early-out condition: the target has no rows or no columns.
480 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Second special case: the inner dimension (columns of the left operand) is zero.
// NOTE(review): the handling inside both branches is not visible here.
483 else if( rhs.
lhs_.columns() == 0UL ) {
// Regular case: hand over to the kernel selection.
498 DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Chooses the kernel for C = A * B.
//   C: target matrix, A: left operand, B: right operand.
// The visible part of the condition sends products whose target has fewer
// than DMATDMATMULT_THRESHOLD elements (rows * columns) to the "small" kernel;
// the other branch uses the BLAS kernel selection.
// NOTE(review): the first half of the condition (original line 519) and the
// `else` line are not visible in this view.
514 template<
typename MT3
517 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
520 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
521 selectSmallAssignKernel( C, A, B );
523 selectBlasAssignKernel( C, A, B );
// Default (scalar) kernel for C = A * B, enabled when neither A nor B is diagonal.
//   C: target matrix, A: left operand (M x K), B: right operand (K x N).
// For every row i the k range is narrowed according to the triangular
// structure of A, and for every k the j range is narrowed according to the
// triangular structure of B. The first k of a row uses plain assignment
// (C(i,j) = ...), all further k accumulate (C(i,j) += ...).
// NOTE(review): several lines (the fall-back branches of the ternaries, the
// loop bodies that handle elements outside [jbegin,jend), closing braces) are
// not visible in this view; comments below describe only what is visible.
542 template<
typename MT3
545 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
546 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
548 const size_t M( A.rows() );
549 const size_t N( B.columns() );
550 const size_t K( A.columns() );
552 for(
size_t i=0UL; i<M; ++i )
// k range for row i: starts at i (or i+1 if strictly upper) when A is upper,
// ends at i+1 (or i if strictly lower) when A is lower.
554 const size_t kbegin( ( IsUpper<MT4>::value )
555 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
557 const size_t kend( ( IsLower<MT4>::value )
558 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Empty k range for a strictly triangular A: the whole row i is handled by
// the loop over all columns below (loop body not visible here).
562 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
563 for(
size_t j=0UL; j<N; ++j ) {
// First contribution (k == kbegin): column range derived from the structure of B.
570 const size_t jbegin( ( IsUpper<MT5>::value )
571 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
573 const size_t jend( ( IsLower<MT5>::value )
574 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
// Columns in front of jbegin (both operands upper); body not visible here.
578 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
579 for(
size_t j=0UL; j<jbegin; ++j ) {
583 else if( IsStrictlyUpper<MT5>::value ) {
// Initialise the active column range with the k == kbegin product.
586 for(
size_t j=jbegin; j<jend; ++j ) {
587 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onwards (both operands lower); body not visible here.
589 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
590 for(
size_t j=jend; j<N; ++j ) {
594 else if( IsStrictlyLower<MT5>::value ) {
// Remaining contributions k = kbegin+1 .. kend-1.
599 for(
size_t k=kbegin+1UL; k<kend; ++k )
601 const size_t jbegin( ( IsUpper<MT5>::value )
602 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
// Note: for a lower B, jend here is one less than in the block above; the
// element at column jend is written by plain assignment further down.
604 const size_t jend( ( IsLower<MT5>::value )
605 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
609 for(
size_t j=jbegin; j<jend; ++j ) {
610 C(i,j) += A(i,k) * B(k,j);
// For a lower B, column jend receives its first value for this row here,
// hence '=' instead of '+='.
612 if( IsLower<MT5>::value ) {
613 C(i,jend) = A(i,k) * B(k,jend);
// Default (scalar) kernel for C = A * B, enabled when A is not diagonal and B is diagonal.
//   C: target matrix, A: left operand, B: diagonal right operand.
// Since only B(j,j) is used, each element is C(i,j) = A(i,j) * B(j,j).
// The column range per row follows the triangular structure of A.
// NOTE(review): the bodies of the loops over [0,jbegin) and [jend,N) and the
// fall-back branches of the ternaries are not visible in this view.
635 template<
typename MT3
638 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
639 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
643 const size_t M( A.rows() );
644 const size_t N( B.columns() );
646 for(
size_t i=0UL; i<M; ++i )
// Active column range of row i, derived from the structure of A.
648 const size_t jbegin( ( IsUpper<MT4>::value )
649 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
651 const size_t jend( ( IsLower<MT4>::value )
652 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Columns in front of the active range (upper A); body not visible here.
656 if( IsUpper<MT4>::value ) {
657 for(
size_t j=0UL; j<jbegin; ++j ) {
// Active range: scale column j of A by the j-th diagonal element of B.
661 for(
size_t j=jbegin; j<jend; ++j ) {
662 C(i,j) = A(i,j) * B(j,j);
// Columns behind the active range (lower A); body not visible here.
664 if( IsLower<MT4>::value ) {
665 for(
size_t j=jend; j<N; ++j ) {
// Default (scalar) kernel for C = A * B, enabled when A is diagonal and B is not diagonal.
//   C: target matrix, A: diagonal left operand, B: right operand.
// Since only A(i,i) is used, each element is C(i,j) = A(i,i) * B(i,j).
// The column range per row follows the triangular structure of B.
// NOTE(review): the bodies of the loops over [0,jbegin) and [jend,N) and the
// fall-back branches of the ternaries are not visible in this view.
688 template<
typename MT3
691 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
692 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
696 const size_t M( A.rows() );
697 const size_t N( B.columns() );
699 for(
size_t i=0UL; i<M; ++i )
// Active column range of row i, derived from the structure of B.
701 const size_t jbegin( ( IsUpper<MT5>::value )
702 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
704 const size_t jend( ( IsLower<MT5>::value )
705 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// Columns in front of the active range (upper B); body not visible here.
709 if( IsUpper<MT5>::value ) {
710 for(
size_t j=0UL; j<jbegin; ++j ) {
// Active range: scale row i of B by the i-th diagonal element of A.
714 for(
size_t j=jbegin; j<jend; ++j ) {
715 C(i,j) = A(i,i) * B(i,j);
// Columns behind the active range (lower B); body not visible here.
717 if( IsLower<MT5>::value ) {
718 for(
size_t j=jend; j<N; ++j ) {
// Default (scalar) kernel for C = A * B, enabled when both A and B are diagonal.
//   C: target matrix, A / B: diagonal operands.
// Only the diagonal of C is written in the visible loop:
// C(i,i) = A(i,i) * B(i,i) for i in [0, A.rows()).
// NOTE(review): lines between the signature and the loop are not visible in
// this view (presumably C is cleared first — confirm).
741 template<
typename MT3
744 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
745 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
751 for(
size_t i=0UL; i<A.rows(); ++i ) {
752 C(i,i) = A(i,i) * B(i,i);
// Small-matrix kernel for C = A * B when the vectorized kernel is NOT
// available for this type triple (DisableIf_ on UseVectorizedDefaultKernel).
// Simply forwards to the default scalar kernel.
771 template<
typename MT3
774 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
775 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
777 selectDefaultAssignKernel( C, A, B );
// Vectorized small-matrix kernel for C = A * B, enabled when
// UseVectorizedDefaultKernel holds, for a DenseMatrix<MT3,false> target
// (presumably the row-major case — confirm the meaning of the bool parameter).
//   C: target matrix, A: left operand (M x K), B: right operand (K x N).
// Structure (as visible below): the columns of C are processed in tiles of
// 8, 4, 2 and 1 SIMD vectors, followed by a scalar loop over the remaining
// columns. Inside a tile the k range is restricted according to the
// triangular structure of A and B; each step combines set( A(i,k) ) with
// SIMD loads from row k of B and the accumulators are stored to C at the end.
// NOTE(review): the accumulators are declared without an initializer
// (e.g. `SIMDType xmm1, ...;`); the kernel therefore relies on the default
// construction of SIMDType yielding zero — confirm.
// NOTE(review): declarations of j / i, several fall-back ternary branches and
// closing braces are not visible in this view.
797 template<
typename MT3
800 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
801 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
803 const size_t M( A.rows() );
804 const size_t N( B.columns() );
805 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
807 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// jpos: N rounded down to a multiple of SIMDSIZE when a remainder is handled
// separately (the mask assumes SIMDSIZE is a power of two), otherwise N.
809 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// ---- Tile of 8 SIMD vectors per row, one row at a time ----
814 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
815 for(
size_t i=0UL; i<M; ++i )
// kbegin: first k that can contribute, given upper A and/or lower B.
817 const size_t kbegin( ( IsUpper<MT4>::value )
818 ?( ( IsLower<MT5>::value )
819 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
820 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
821 :( IsLower<MT5>::value ? j : 0UL ) );
// kend: one past the last k that can contribute, given lower A and/or upper B.
822 const size_t kend( ( IsLower<MT4>::value )
823 ?( ( IsUpper<MT5>::value )
824 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
825 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
826 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
828 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
830 for(
size_t k=kbegin; k<kend; ++k ) {
831 const SIMDType a1(
set( A(i,k) ) );
832 xmm1 = xmm1 + a1 * B.load(k,j );
833 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
834 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
835 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
836 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
837 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
838 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
839 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
842 (~C).store( i, j , xmm1 );
843 (~C).store( i, j+SIMDSIZE , xmm2 );
844 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
845 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
846 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
847 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
848 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
849 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// ---- Tile of 4 SIMD vectors, two rows at a time, then a single leftover row ----
853 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
857 for( ; (i+2UL) <= M; i+=2UL )
859 const size_t kbegin( ( IsUpper<MT4>::value )
860 ?( ( IsLower<MT5>::value )
861 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
862 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
863 :( IsLower<MT5>::value ? j : 0UL ) );
// kend covers both rows i and i+1 (hence i+1 / i+2 instead of i / i+1).
864 const size_t kend( ( IsLower<MT4>::value )
865 ?( ( IsUpper<MT5>::value )
866 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
867 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
868 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
870 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
872 for(
size_t k=kbegin; k<kend; ++k ) {
873 const SIMDType a1(
set( A(i ,k) ) );
874 const SIMDType a2(
set( A(i+1UL,k) ) );
875 const SIMDType b1( B.load(k,j ) );
876 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
877 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
878 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
879 xmm1 = xmm1 + a1 * b1;
880 xmm2 = xmm2 + a1 * b2;
881 xmm3 = xmm3 + a1 * b3;
882 xmm4 = xmm4 + a1 * b4;
883 xmm5 = xmm5 + a2 * b1;
884 xmm6 = xmm6 + a2 * b2;
885 xmm7 = xmm7 + a2 * b3;
886 xmm8 = xmm8 + a2 * b4;
889 (~C).store( i , j , xmm1 );
890 (~C).store( i , j+SIMDSIZE , xmm2 );
891 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
892 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
893 (~C).store( i+1UL, j , xmm5 );
894 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
895 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
896 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Leftover single row of the 4-vector tile (the guarding condition is not visible here).
901 const size_t kbegin( ( IsUpper<MT4>::value )
902 ?( ( IsLower<MT5>::value )
903 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
904 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
905 :( IsLower<MT5>::value ? j : 0UL ) );
906 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
908 SIMDType xmm1, xmm2, xmm3, xmm4;
910 for(
size_t k=kbegin; k<kend; ++k ) {
911 const SIMDType a1(
set( A(i,k) ) );
912 xmm1 = xmm1 + a1 * B.load(k,j );
913 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
914 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
915 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
918 (~C).store( i, j , xmm1 );
919 (~C).store( i, j+SIMDSIZE , xmm2 );
920 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
921 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// ---- Tile of 2 SIMD vectors, two rows at a time, then a single leftover row ----
925 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
929 for( ; (i+2UL) <= M; i+=2UL )
931 const size_t kbegin( ( IsUpper<MT4>::value )
932 ?( ( IsLower<MT5>::value )
933 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
934 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
935 :( IsLower<MT5>::value ? j : 0UL ) );
936 const size_t kend( ( IsLower<MT4>::value )
937 ?( ( IsUpper<MT5>::value )
938 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
939 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
940 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
// xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
942 SIMDType xmm1, xmm2, xmm3, xmm4;
944 for(
size_t k=kbegin; k<kend; ++k ) {
945 const SIMDType a1(
set( A(i ,k) ) );
946 const SIMDType a2(
set( A(i+1UL,k) ) );
947 const SIMDType b1( B.load(k,j ) );
948 const SIMDType b2( B.load(k,j+SIMDSIZE) );
949 xmm1 = xmm1 + a1 * b1;
950 xmm2 = xmm2 + a1 * b2;
951 xmm3 = xmm3 + a2 * b1;
952 xmm4 = xmm4 + a2 * b2;
955 (~C).store( i , j , xmm1 );
956 (~C).store( i , j+SIMDSIZE, xmm2 );
957 (~C).store( i+1UL, j , xmm3 );
958 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// Leftover single row of the 2-vector tile (guard and accumulator declaration not visible here).
963 const size_t kbegin( ( IsUpper<MT4>::value )
964 ?( ( IsLower<MT5>::value )
965 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
966 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
967 :( IsLower<MT5>::value ? j : 0UL ) );
968 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
972 for(
size_t k=kbegin; k<kend; ++k ) {
973 const SIMDType a1(
set( A(i,k) ) );
974 xmm1 = xmm1 + a1 * B.load(k,j );
975 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
978 (~C).store( i, j , xmm1 );
979 (~C).store( i, j+SIMDSIZE, xmm2 );
// ---- Single SIMD vector per step, two rows at a time, then a single leftover row ----
983 for( ; j<jpos; j+=SIMDSIZE )
987 for( ; (i+2UL) <= M; i+=2UL )
989 const size_t kbegin( ( IsUpper<MT4>::value )
990 ?( ( IsLower<MT5>::value )
991 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
992 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
993 :( IsLower<MT5>::value ? j : 0UL ) );
994 const size_t kend( ( IsLower<MT4>::value )
995 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1000 for(
size_t k=kbegin; k<kend; ++k ) {
1001 const SIMDType b1( B.load(k,j) );
1002 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1003 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1006 (~C).store( i , j, xmm1 );
1007 (~C).store( i+1UL, j, xmm2 );
// Leftover single row; note that this loop runs k up to K directly.
1012 const size_t kbegin( ( IsUpper<MT4>::value )
1013 ?( ( IsLower<MT5>::value )
1014 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1015 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1016 :( IsLower<MT5>::value ? j : 0UL ) );
1020 for(
size_t k=kbegin; k<K; ++k ) {
1021 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1024 (~C).store( i, j, xmm1 );
// ---- Scalar remainder columns (only when `remainder` is true) ----
1028 for( ; remainder && j<N; ++j )
1032 for( ; (i+2UL) <= M; i+=2UL )
1034 const size_t kbegin( ( IsUpper<MT4>::value )
1035 ?( ( IsLower<MT5>::value )
1036 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1037 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1038 :( IsLower<MT5>::value ? j : 0UL ) );
1039 const size_t kend( ( IsLower<MT4>::value )
1040 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
// value1 / value2 are the scalar accumulators for rows i and i+1
// (their declarations are not visible in this view).
1046 for(
size_t k=kbegin; k<kend; ++k ) {
1047 value1 += A(i ,k) * B(k,j);
1048 value2 += A(i+1UL,k) * B(k,j);
1051 (~C)(i ,j) = value1;
1052 (~C)(i+1UL,j) = value2;
// Leftover single row of the scalar remainder.
1057 const size_t kbegin( ( IsUpper<MT4>::value )
1058 ?( ( IsLower<MT5>::value )
1059 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1060 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1061 :( IsLower<MT5>::value ? j : 0UL ) );
1065 for(
size_t k=kbegin; k<K; ++k ) {
1066 value += A(i,k) * B(k,j);
// Vectorized small-matrix kernel for C = A * B, enabled when
// UseVectorizedDefaultKernel holds, for a DenseMatrix<MT3,true> target.
//   C: target matrix, A: left operand, B: right operand.
// Instead of multiplying directly, one operand is first converted (serially)
// into a temporary of its OppositeType and the product with that temporary is
// assigned to C. Which operand is converted:
//   1. A, if A is not resizable but B is;
//   2. B, if A is resizable but B is not;
//   3. otherwise A if it has no more elements than B, else B.
// NOTE(review): the final `else` line and closing braces are not visible in this view.
1091 template<
typename MT3
1094 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1095 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1102 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1103 const OppositeType_<MT4> tmp(
serial( A ) );
1104 assign( ~C, tmp * B );
1106 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1107 const OppositeType_<MT5> tmp(
serial( B ) );
1108 assign( ~C, A * tmp );
// Neither rule above applies: convert the operand with fewer (or equal) elements.
1110 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1111 const OppositeType_<MT4> tmp(
serial( A ) );
1112 assign( ~C, tmp * B );
1115 const OppositeType_<MT5> tmp(
serial( B ) );
1116 assign( ~C, A * tmp );
// Large-matrix kernel for C = A * B when the vectorized kernel is NOT
// available for this type triple (DisableIf_ on UseVectorizedDefaultKernel).
// Simply forwards to the default scalar kernel.
1135 template<
typename MT3
1138 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1139 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1141 selectDefaultAssignKernel( C, A, B );
// Vectorized large-matrix kernel for C = A * B, enabled when
// UseVectorizedDefaultKernel holds, for a DenseMatrix<MT3,false> target
// (presumably the row-major case — confirm the meaning of the bool parameter).
//   C: target matrix, A: left operand (M x K), B: right operand (K x N).
// Structure (as visible below): the iteration space is blocked in j, i and k
// using DMATDMATMULT_JBLOCK_SIZE / _IBLOCK_SIZE / _KBLOCK_SIZE. Within a block
// the columns are processed in tiles of 4, 2 and 1 SIMD vectors plus a scalar
// remainder. Unlike the small kernel, accumulators are initialised by loading
// the current contents of C, because each k block adds to the result of the
// previous ones.
// NOTE(review): the body of the loop nest at original lines 1184-1185 is not
// visible; presumably it clears the C block before accumulation — confirm.
// NOTE(review): declarations of j / i, some guards and closing braces are not
// visible in this view.
1161 template<
typename MT3
1164 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1165 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1167 const size_t M( A.rows() );
1168 const size_t N( B.columns() );
1169 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
1171 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// ---- Column blocks ----
1173 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
1175 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// jpos: jend rounded down to a multiple of SIMDSIZE when a remainder is
// handled separately; the assertion below checks exactly that.
1177 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1178 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// ---- Row blocks ----
1180 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
1182 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Pass over the current (ii..iend) x (jj..jend) block of C; body not visible here.
1184 for(
size_t i=ii; i<iend; ++i ) {
1185 for(
size_t j=jj; j<jend; ++j ) {
// ---- Inner-dimension blocks ----
1190 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
1192 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// ---- Tile of 4 SIMD vectors, two rows at a time, then a single leftover row ----
1196 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1198 const size_t j1( j+SIMDSIZE );
1199 const size_t j2( j+SIMDSIZE*2UL );
1200 const size_t j3( j+SIMDSIZE*3UL );
1204 for( ; (i+2UL) <= iend; i+=2UL )
// k range: the current k block [kk,ktmp) intersected with the range allowed
// by an upper A / lower B (start) and a lower A / upper B (end).
1206 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1207 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1208 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1209 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
// Start from the values already accumulated in C.
1211 SIMDType xmm1( (~C).load(i ,j ) );
1212 SIMDType xmm2( (~C).load(i ,j1) );
1213 SIMDType xmm3( (~C).load(i ,j2) );
1214 SIMDType xmm4( (~C).load(i ,j3) );
1215 SIMDType xmm5( (~C).load(i+1UL,j ) );
1216 SIMDType xmm6( (~C).load(i+1UL,j1) );
1217 SIMDType xmm7( (~C).load(i+1UL,j2) );
1218 SIMDType xmm8( (~C).load(i+1UL,j3) );
1220 for(
size_t k=kbegin; k<kend; ++k ) {
1221 const SIMDType a1(
set( A(i ,k) ) );
1222 const SIMDType a2(
set( A(i+1UL,k) ) );
1223 const SIMDType b1( B.load(k,j ) );
1224 const SIMDType b2( B.load(k,j1) );
1225 const SIMDType b3( B.load(k,j2) );
1226 const SIMDType b4( B.load(k,j3) );
1227 xmm1 = xmm1 + a1 * b1;
1228 xmm2 = xmm2 + a1 * b2;
1229 xmm3 = xmm3 + a1 * b3;
1230 xmm4 = xmm4 + a1 * b4;
1231 xmm5 = xmm5 + a2 * b1;
1232 xmm6 = xmm6 + a2 * b2;
1233 xmm7 = xmm7 + a2 * b3;
1234 xmm8 = xmm8 + a2 * b4;
1237 (~C).store( i , j , xmm1 );
1238 (~C).store( i , j1, xmm2 );
1239 (~C).store( i , j2, xmm3 );
1240 (~C).store( i , j3, xmm4 );
1241 (~C).store( i+1UL, j , xmm5 );
1242 (~C).store( i+1UL, j1, xmm6 );
1243 (~C).store( i+1UL, j2, xmm7 );
1244 (~C).store( i+1UL, j3, xmm8 );
// Leftover single row of the 4-vector tile (guard not visible here).
1249 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1250 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1251 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1252 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
1254 SIMDType xmm1( (~C).load(i,j ) );
1255 SIMDType xmm2( (~C).load(i,j1) );
1256 SIMDType xmm3( (~C).load(i,j2) );
1257 SIMDType xmm4( (~C).load(i,j3) );
1259 for(
size_t k=kbegin; k<kend; ++k ) {
1260 const SIMDType a1(
set( A(i,k) ) );
1261 xmm1 = xmm1 + a1 * B.load(k,j );
1262 xmm2 = xmm2 + a1 * B.load(k,j1);
1263 xmm3 = xmm3 + a1 * B.load(k,j2);
1264 xmm4 = xmm4 + a1 * B.load(k,j3);
1267 (~C).store( i, j , xmm1 );
1268 (~C).store( i, j1, xmm2 );
1269 (~C).store( i, j2, xmm3 );
1270 (~C).store( i, j3, xmm4 );
// ---- Tile of 2 SIMD vectors: four rows, then two rows, then a single row ----
1274 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1276 const size_t j1( j+SIMDSIZE );
1280 for( ; (i+4UL) <= iend; i+=4UL )
1282 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1283 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1284 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1285 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
// Two accumulators per row for rows i .. i+3.
1287 SIMDType xmm1( (~C).load(i ,j ) );
1288 SIMDType xmm2( (~C).load(i ,j1) );
1289 SIMDType xmm3( (~C).load(i+1UL,j ) );
1290 SIMDType xmm4( (~C).load(i+1UL,j1) );
1291 SIMDType xmm5( (~C).load(i+2UL,j ) );
1292 SIMDType xmm6( (~C).load(i+2UL,j1) );
1293 SIMDType xmm7( (~C).load(i+3UL,j ) );
1294 SIMDType xmm8( (~C).load(i+3UL,j1) );
1296 for(
size_t k=kbegin; k<kend; ++k ) {
1297 const SIMDType a1(
set( A(i ,k) ) );
1298 const SIMDType a2(
set( A(i+1UL,k) ) );
1299 const SIMDType a3(
set( A(i+2UL,k) ) );
1300 const SIMDType a4(
set( A(i+3UL,k) ) );
1301 const SIMDType b1( B.load(k,j ) );
1302 const SIMDType b2( B.load(k,j1) );
1303 xmm1 = xmm1 + a1 * b1;
1304 xmm2 = xmm2 + a1 * b2;
1305 xmm3 = xmm3 + a2 * b1;
1306 xmm4 = xmm4 + a2 * b2;
1307 xmm5 = xmm5 + a3 * b1;
1308 xmm6 = xmm6 + a3 * b2;
1309 xmm7 = xmm7 + a4 * b1;
1310 xmm8 = xmm8 + a4 * b2;
1313 (~C).store( i , j , xmm1 );
1314 (~C).store( i , j1, xmm2 );
1315 (~C).store( i+1UL, j , xmm3 );
1316 (~C).store( i+1UL, j1, xmm4 );
1317 (~C).store( i+2UL, j , xmm5 );
1318 (~C).store( i+2UL, j1, xmm6 );
1319 (~C).store( i+3UL, j , xmm7 );
1320 (~C).store( i+3UL, j1, xmm8 );
// Two rows at a time.
1323 for( ; (i+2UL) <= iend; i+=2UL )
1325 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1326 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1327 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1328 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1330 SIMDType xmm1( (~C).load(i ,j ) );
1331 SIMDType xmm2( (~C).load(i ,j1) );
1332 SIMDType xmm3( (~C).load(i+1UL,j ) );
1333 SIMDType xmm4( (~C).load(i+1UL,j1) );
1335 for(
size_t k=kbegin; k<kend; ++k ) {
1336 const SIMDType a1(
set( A(i ,k) ) );
1337 const SIMDType a2(
set( A(i+1UL,k) ) );
1338 const SIMDType b1( B.load(k,j ) );
1339 const SIMDType b2( B.load(k,j1) );
1340 xmm1 = xmm1 + a1 * b1;
1341 xmm2 = xmm2 + a1 * b2;
1342 xmm3 = xmm3 + a2 * b1;
1343 xmm4 = xmm4 + a2 * b2;
1346 (~C).store( i , j , xmm1 );
1347 (~C).store( i , j1, xmm2 );
1348 (~C).store( i+1UL, j , xmm3 );
1349 (~C).store( i+1UL, j1, xmm4 );
// Leftover single row of the 2-vector tile (guard not visible here).
1354 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1355 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1356 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1357 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1359 SIMDType xmm1( (~C).load(i,j ) );
1360 SIMDType xmm2( (~C).load(i,j1) );
1362 for(
size_t k=kbegin; k<kend; ++k ) {
1363 const SIMDType a1(
set( A(i,k) ) );
1364 xmm1 = xmm1 + a1 * B.load(k,j );
1365 xmm2 = xmm2 + a1 * B.load(k,j1);
1368 (~C).store( i, j , xmm1 );
1369 (~C).store( i, j1, xmm2 );
// ---- Single SIMD vector per step, one row at a time ----
1373 for( ; j<jpos; j+=SIMDSIZE )
1375 for(
size_t i=ii; i<iend; ++i )
1377 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1378 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1379 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1380 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
1382 SIMDType xmm1( (~C).load(i,j) );
1384 for(
size_t k=kbegin; k<kend; ++k ) {
1385 const SIMDType a1(
set( A(i,k) ) );
1386 xmm1 = xmm1 + a1 * B.load(k,j);
1389 (~C).store( i, j, xmm1 );
// ---- Scalar remainder columns (only when `remainder` is true) ----
1393 for( ; remainder && j<jend; ++j )
1395 for(
size_t i=ii; i<iend; ++i )
1397 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1398 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1399 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1400 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// Scalar accumulator seeded with the current value of C(i,j); the write-back
// line is not visible in this view.
1402 ElementType value( (~C)(i,j) );
1404 for(
size_t k=kbegin; k<kend; ++k ) {
1405 value += A(i,k) * B(k,j);
// Vectorized large-matrix kernel for C = A * B for a DenseMatrix<MT3,true>
// target. No dedicated blocked implementation exists for this case in the
// visible code: the call is forwarded to selectSmallAssignKernel( ~C, A, B ).
1432 template<
typename MT3
1435 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1436 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1438 selectSmallAssignKernel( ~C, A, B );
// BLAS kernel selection for C = A * B when the BLAS kernel is NOT usable for
// this type triple (DisableIf_ on UseBlasKernel).
// Falls back to the large-matrix kernel.
1456 template<
typename MT3
1459 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
1460 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1462 selectLargeAssignKernel( C, A, B );
1468 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based kernel for C = A * B, enabled when UseBlasKernel holds (and, per
// the surrounding #if, only when BLAS mode and BLAS matrix/matrix
// multiplication are switched on).
//   C: target matrix, A: left operand, B: right operand.
// Dispatch:
//   - A triangular: trmm with A applied from the left (CblasLeft)
//   - B triangular: trmm with B applied from the right (CblasRight)
//   - otherwise:    gemm with scaling factors ET(1) and ET(0)
// NOTE(review): the lines preceding each trmm call (original 1490 / 1494) are
// not visible; presumably C is first assigned the other operand, since trmm
// is given only C and the triangular matrix — confirm.
1481 template<
typename MT3
1484 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
1485 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1487 typedef ElementType_<MT3> ET;
1489 if( IsTriangular<MT4>::value ) {
1491 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1493 else if( IsTriangular<MT5>::value ) {
1495 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1498 gemm( C, A, B, ET(1), ET(0) );
// Assignment path that goes through a temporary, enabled only when the
// symmetry shortcut does not apply (DisableIf_ on CanExploitSymmetry).
// NOTE(review): the function name and parameter list are not visible in this
// view; the visible body evaluates `rhs` serially into a temporary and then
// assigns that temporary to the target.
1518 template<
typename MT
1520 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
// Temporary type: OppositeType if SO is true, ResultType otherwise.
1525 typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
1537 const TmpType tmp(
serial( rhs ) );
1538 assign( ~lhs, tmp );
// Assignment path that exploits operand symmetry, enabled only when
// CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are not visible in this
// view. The visible body rewrites the product using trans() on the symmetric
// operand(s) and forwards to assign():
//   - both operands symmetric: trans(lhs_) * trans(rhs_)
//   - only MT1 symmetric:      trans(lhs_) * rhs_
//   - otherwise:               lhs_ * trans(rhs_)
1558 template<
typename MT >
1559 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1569 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1570 assign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
1571 else if( IsSymmetric<MT1>::value )
1572 assign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
1574 assign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Entry point that ends by dispatching to selectAddAssignKernel( ~lhs, A, B ),
// enabled only when the symmetry shortcut does not apply.
// NOTE(review): the function name and parameter list are not visible in this
// view; presumably this is the addAssign() overload for a dense target —
// confirm against the full file.
1592 template<
typename MT
1594 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
// Early-out condition: empty target or zero inner dimension
// (what happens inside the branch is not visible here).
1602 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Evaluate both operands serially into A and B (types LT / RT are defined elsewhere).
1606 LT A(
serial( rhs.lhs_ ) );
1607 RT B(
serial( rhs.rhs_ ) );
1616 DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Chooses the kernel for C += A * B.
//   C: target matrix, A: left operand, B: right operand.
// The "small" kernel is used when B is diagonal or when the target has fewer
// than DMATDMATMULT_THRESHOLD elements (rows * columns); the other branch
// uses the BLAS kernel selection.
// NOTE(review): the `else` line is not visible in this view.
1632 template<
typename MT3
1635 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1637 if( ( IsDiagonal<MT5>::value ) ||
1638 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1639 selectSmallAddAssignKernel( C, A, B );
1641 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) kernel for C += A * B, enabled when neither A nor B is diagonal.
//   C: target matrix, A: left operand (M x K), B: right operand (K x N).
// For every (i,k) pair the contributing column range [jbegin,jend) is derived
// from the triangular structure of B; the j loop is unrolled by two with a
// single trailing element.
// NOTE(review): the fall-back branches of the ternaries and the guard around
// the trailing element (original line 1697) are not visible in this view.
1660 template<
typename MT3
1663 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1664 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1666 const size_t M( A.rows() );
1667 const size_t N( B.columns() );
1668 const size_t K( A.columns() );
1670 for(
size_t i=0UL; i<M; ++i )
// k range for row i, narrowed by the triangular structure of A.
1672 const size_t kbegin( ( IsUpper<MT4>::value )
1673 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1675 const size_t kend( ( IsLower<MT4>::value )
1676 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1680 for(
size_t k=kbegin; k<kend; ++k )
// j range for this k, narrowed by the triangular structure of B.
1682 const size_t jbegin( ( IsUpper<MT5>::value )
1683 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
1685 const size_t jend( ( IsLower<MT5>::value )
1686 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos: end of the part of the range that can be processed two columns at a
// time (jnum rounded down to an even count).
1690 const size_t jnum( jend - jbegin );
1691 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1693 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1694 C(i,j ) += A(i,k) * B(k,j );
1695 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Trailing element when the range length is odd (guard not visible here).
1698 C(i,jpos) += A(i,k) * B(k,jpos);
// Default (scalar) kernel for C += A * B, enabled when A is not diagonal and B is diagonal.
//   C: target matrix, A: left operand, B: diagonal right operand.
// Each contribution is C(i,j) += A(i,j) * B(j,j); the column range per row
// follows the triangular structure of A and is unrolled by two.
// NOTE(review): the fall-back branches of the ternaries and the guard around
// the trailing element are not visible in this view.
1720 template<
typename MT3
1723 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1724 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1728 const size_t M( A.rows() );
1729 const size_t N( B.columns() );
1731 for(
size_t i=0UL; i<M; ++i )
// Active column range of row i, derived from the structure of A.
1733 const size_t jbegin( ( IsUpper<MT4>::value )
1734 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1736 const size_t jend( ( IsLower<MT4>::value )
1737 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos: end of the part of the range processed two columns at a time.
1741 const size_t jnum( jend - jbegin );
1742 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1744 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1745 C(i,j ) += A(i,j ) * B(j ,j );
1746 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Trailing element when the range length is odd (guard not visible here).
1749 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default (scalar) kernel for C += A * B, enabled when A is diagonal and B is not diagonal.
//   C: target matrix, A: diagonal left operand, B: right operand.
// Each contribution is C(i,j) += A(i,i) * B(i,j); the column range per row
// follows the triangular structure of B and is unrolled by two.
// NOTE(review): the fall-back branches of the ternaries and the guard around
// the trailing element are not visible in this view.
1770 template<
typename MT3
1773 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
1774 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1778 const size_t M( A.rows() );
1779 const size_t N( B.columns() );
1781 for(
size_t i=0UL; i<M; ++i )
// Active column range of row i, derived from the structure of B.
1783 const size_t jbegin( ( IsUpper<MT5>::value )
1784 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
1786 const size_t jend( ( IsLower<MT5>::value )
1787 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos: end of the part of the range processed two columns at a time.
1791 const size_t jnum( jend - jbegin );
1792 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1794 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1795 C(i,j ) += A(i,i) * B(i,j );
1796 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
// Trailing element when the range length is odd (guard not visible here).
1799 C(i,jpos) += A(i,i) * B(i,jpos);
// Default (scalar) kernel for C += A * B, enabled when both A and B are diagonal.
//   C: target matrix, A / B: diagonal operands.
// Only diagonal elements receive a contribution:
// C(i,i) += A(i,i) * B(i,i) for i in [0, A.rows()).
1820 template<
typename MT3
1823 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
1824 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1828 for(
size_t i=0UL; i<A.rows(); ++i ) {
1829 C(i,i) += A(i,i) * B(i,i);
// Small-matrix kernel for C += A * B when the vectorized kernel is NOT
// available for this type triple (DisableIf_ on UseVectorizedDefaultKernel).
// Simply forwards to the default scalar add-assign kernel.
1849 template<
typename MT3
1852 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1853 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1855 selectDefaultAddAssignKernel( C, A, B );
// Small-matrix add-assignment kernel, vectorized overload for row-major targets
// (DenseMatrix<MT3,false>): C += A * B. Active when UseVectorizedDefaultKernel is true.
//
// The columns of C are processed in panels of decreasing width (8, 4, 2, 1 SIMD vectors),
// followed by a scalar loop for the columns that do not fill a whole SIMD vector. Within
// a panel, each accumulator register is loaded from C, updated with set(A(i,k)) * B.load(k,j)
// for every k in [kbegin, kend), and stored back to C.
//
// kbegin/kend narrow the k range according to the compile-time structure of the operands:
// an upper A starts at i (i+1 if strictly upper), a lower B starts at j, a lower A ends at
// the last touched row + 1 (one less if strictly lower), an upper B ends at the end of the
// current column panel (clamped to K).
//
// NOTE(review): braces, the declarations of the running indices `j` and `i`, the
// non-triangular branches of some kend expressions and the guards around the single-row
// tails are not visible in this excerpt.
1875 template<
typename MT3
1878 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1879 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1881 const size_t M( A.rows() );
1882 const size_t N( B.columns() );
1883 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
1885 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the vectorizable column range: N masked with size_t(-SIMDSIZE) when a remainder
// is required (rounds N down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two).
1887 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// --- Panels of 8 SIMD vectors, one row of C at a time ---
1892 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1893 for(
size_t i=0UL; i<M; ++i )
1895 const size_t kbegin( ( IsUpper<MT4>::value )
1896 ?( ( IsLower<MT5>::value )
1897 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1898 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1899 :( IsLower<MT5>::value ? j : 0UL ) );
1900 const size_t kend( ( IsLower<MT4>::value )
1901 ?( ( IsUpper<MT5>::value )
1902 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1903 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1904 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
// Accumulators start from the current contents of C (add-assignment).
1906 SIMDType xmm1( (~C).load(i,j ) );
1907 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
1908 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
1909 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
1910 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
1911 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
1912 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
1913 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
1915 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast A(i,k) and multiply it with eight consecutive vectors of row k of B.
1916 const SIMDType a1(
set( A(i,k) ) );
1917 xmm1 = xmm1 + a1 * B.load(k,j );
1918 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
1919 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
1920 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
1921 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
1922 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
1923 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
1924 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
1927 (~C).store( i, j , xmm1 );
1928 (~C).store( i, j+SIMDSIZE , xmm2 );
1929 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1930 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1931 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1932 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1933 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1934 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// --- Panels of 4 SIMD vectors, two rows of C at a time ---
1938 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1942 for( ; (i+2UL) <= M; i+=2UL )
1944 const size_t kbegin( ( IsUpper<MT4>::value )
1945 ?( ( IsLower<MT5>::value )
1946 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1947 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1948 :( IsLower<MT5>::value ? j : 0UL ) );
// Two rows (i and i+1) are handled, hence the lower-A bound of i+2 (i+1 if strictly lower).
1949 const size_t kend( ( IsLower<MT4>::value )
1950 ?( ( IsUpper<MT5>::value )
1951 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1952 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1953 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
1955 SIMDType xmm1( (~C).load(i ,j ) );
1956 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
1957 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
1958 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
1959 SIMDType xmm5( (~C).load(i+1UL,j ) );
1960 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
1961 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
1962 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
1964 for(
size_t k=kbegin; k<kend; ++k ) {
// Each loaded vector of B is reused for both rows of A.
1965 const SIMDType a1(
set( A(i ,k) ) );
1966 const SIMDType a2(
set( A(i+1UL,k) ) );
1967 const SIMDType b1( B.load(k,j ) );
1968 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1969 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1970 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1971 xmm1 = xmm1 + a1 * b1;
1972 xmm2 = xmm2 + a1 * b2;
1973 xmm3 = xmm3 + a1 * b3;
1974 xmm4 = xmm4 + a1 * b4;
1975 xmm5 = xmm5 + a2 * b1;
1976 xmm6 = xmm6 + a2 * b2;
1977 xmm7 = xmm7 + a2 * b3;
1978 xmm8 = xmm8 + a2 * b4;
1981 (~C).store( i , j , xmm1 );
1982 (~C).store( i , j+SIMDSIZE , xmm2 );
1983 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1984 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1985 (~C).store( i+1UL, j , xmm5 );
1986 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1987 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1988 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row tail of the 4-vector panel (enclosing guard not visible in this excerpt).
1993 const size_t kbegin( ( IsUpper<MT4>::value )
1994 ?( ( IsLower<MT5>::value )
1995 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1996 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1997 :( IsLower<MT5>::value ? j : 0UL ) );
1998 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
2000 SIMDType xmm1( (~C).load(i,j ) );
2001 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2002 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2003 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2005 for(
size_t k=kbegin; k<kend; ++k ) {
2006 const SIMDType a1(
set( A(i,k) ) );
2007 xmm1 = xmm1 + a1 * B.load(k,j );
2008 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
2009 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
2010 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
2013 (~C).store( i, j , xmm1 );
2014 (~C).store( i, j+SIMDSIZE , xmm2 );
2015 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2016 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// --- Panels of 2 SIMD vectors, two rows of C at a time ---
2020 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2024 for( ; (i+2UL) <= M; i+=2UL )
2026 const size_t kbegin( ( IsUpper<MT4>::value )
2027 ?( ( IsLower<MT5>::value )
2028 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2029 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2030 :( IsLower<MT5>::value ? j : 0UL ) );
2031 const size_t kend( ( IsLower<MT4>::value )
2032 ?( ( IsUpper<MT5>::value )
2033 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2034 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2035 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
2037 SIMDType xmm1( (~C).load(i ,j ) );
2038 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2039 SIMDType xmm3( (~C).load(i+1UL,j ) );
2040 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2042 for(
size_t k=kbegin; k<kend; ++k ) {
2043 const SIMDType a1(
set( A(i ,k) ) );
2044 const SIMDType a2(
set( A(i+1UL,k) ) );
2045 const SIMDType b1( B.load(k,j ) );
2046 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2047 xmm1 = xmm1 + a1 * b1;
2048 xmm2 = xmm2 + a1 * b2;
2049 xmm3 = xmm3 + a2 * b1;
2050 xmm4 = xmm4 + a2 * b2;
2053 (~C).store( i , j , xmm1 );
2054 (~C).store( i , j+SIMDSIZE, xmm2 );
2055 (~C).store( i+1UL, j , xmm3 );
2056 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// Single-row tail of the 2-vector panel (enclosing guard not visible in this excerpt).
2061 const size_t kbegin( ( IsUpper<MT4>::value )
2062 ?( ( IsLower<MT5>::value )
2063 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2064 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2065 :( IsLower<MT5>::value ? j : 0UL ) );
2066 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
2068 SIMDType xmm1( (~C).load(i,j ) );
2069 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
2071 for(
size_t k=kbegin; k<kend; ++k ) {
2072 const SIMDType a1(
set( A(i,k) ) );
2073 xmm1 = xmm1 + a1 * B.load(k,j );
2074 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
2077 (~C).store( i, j , xmm1 );
2078 (~C).store( i, j+SIMDSIZE, xmm2 );
// --- Panels of 1 SIMD vector, two rows of C at a time ---
2082 for( ; j<jpos; j+=SIMDSIZE )
2086 for( ; (i+2UL) <= M; i+=2UL )
2088 const size_t kbegin( ( IsUpper<MT4>::value )
2089 ?( ( IsLower<MT5>::value )
2090 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2091 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2092 :( IsLower<MT5>::value ? j : 0UL ) );
2093 const size_t kend( ( IsLower<MT4>::value )
2094 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2097 SIMDType xmm1( (~C).load(i ,j) );
2098 SIMDType xmm2( (~C).load(i+1UL,j) );
2100 for(
size_t k=kbegin; k<kend; ++k ) {
2101 const SIMDType b1( B.load(k,j) );
2102 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2103 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2106 (~C).store( i , j, xmm1 );
2107 (~C).store( i+1UL, j, xmm2 );
// Single-row tail of the 1-vector panel; the k loop runs up to K here.
2112 const size_t kbegin( ( IsUpper<MT4>::value )
2113 ?( ( IsLower<MT5>::value )
2114 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2115 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2116 :( IsLower<MT5>::value ? j : 0UL ) );
2118 SIMDType xmm1( (~C).load(i,j) );
2120 for(
size_t k=kbegin; k<K; ++k ) {
2121 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
2124 (~C).store( i, j, xmm1 );
// --- Scalar remainder: columns in [jpos, N), only when `remainder` is set ---
2128 for( ; remainder && j<N; ++j )
2132 for( ; (i+2UL) <= M; i+=2UL )
2134 const size_t kbegin( ( IsUpper<MT4>::value )
2135 ?( ( IsLower<MT5>::value )
2136 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2137 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2138 :( IsLower<MT5>::value ? j : 0UL ) );
2139 const size_t kend( ( IsLower<MT4>::value )
2140 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
// Scalar accumulators for two rows, seeded from C.
2143 ElementType value1( (~C)(i ,j) );
2144 ElementType value2( (~C)(i+1UL,j) );
2146 for(
size_t k=kbegin; k<kend; ++k ) {
2147 value1 += A(i ,k) * B(k,j);
2148 value2 += A(i+1UL,k) * B(k,j);
2151 (~C)(i ,j) = value1;
2152 (~C)(i+1UL,j) = value2;
// Single-row tail of the scalar remainder.
2157 const size_t kbegin( ( IsUpper<MT4>::value )
2158 ?( ( IsLower<MT5>::value )
2159 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2160 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2161 :( IsLower<MT5>::value ? j : 0UL ) );
2163 ElementType value( (~C)(i,j) );
2165 for(
size_t k=kbegin; k<K; ++k ) {
2166 value += A(i,k) * B(k,j);
// Small-matrix add-assignment kernel, vectorized overload for column-major targets
// (DenseMatrix<MT3,true>). Instead of computing the product directly, one operand is
// copied into a temporary of the opposite storage order (OppositeType_) and the
// add-assignment is restarted with that temporary:
//   - A is converted if A is not resizable while B is;
//   - B is converted if A is resizable while B is not;
//   - otherwise the operand with fewer elements (rows*columns) is converted, A on a tie.
// The temporaries are built from serial(...) of the operand.
// NOTE(review): braces and the final `else` line are not visible in this excerpt.
2191 template<
typename MT3
2194 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2195 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2202 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2203 const OppositeType_<MT4> tmp(
serial( A ) );
2204 addAssign( ~C, tmp * B );
2206 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2207 const OppositeType_<MT5> tmp(
serial( B ) );
2208 addAssign( ~C, A * tmp );
// Both or neither resizable: convert the smaller operand.
2210 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2211 const OppositeType_<MT4> tmp(
serial( A ) );
2212 addAssign( ~C, tmp * B );
2215 const OppositeType_<MT5> tmp(
serial( B ) );
2216 addAssign( ~C, A * tmp );
// Large-matrix add-assignment kernel, non-vectorized overload: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false (DisableIf_). It simply forwards
// to the default (scalar) add-assignment kernel.
2236 template<
typename MT3
2239 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2240 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2242 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix add-assignment kernel, vectorized overload for row-major targets
// (DenseMatrix<MT3,false>): C += A * B. Active when UseVectorizedDefaultKernel is true.
//
// The iteration space is tiled in three nested block loops: columns of C in steps of
// DMATDMATMULT_JBLOCK_SIZE (jj), rows of C in steps of DMATDMATMULT_IBLOCK_SIZE (ii) and
// the reduction dimension in steps of DMATDMATMULT_KBLOCK_SIZE (kk). Inside a tile the
// columns are processed in panels of 4, 2 and 1 SIMD vectors, followed by a scalar loop
// for the remaining columns. Every micro-kernel loads its accumulators from C, adds
// set(A(i,k)) * B.load(k,j) for k in [kbegin, kend) and stores them back, so partial
// sums from successive kk blocks accumulate in C.
//
// kbegin/kend clamp the k range to the current k block [kk, ktmp) and additionally to the
// structure of the operands: k >= i for an upper A, k >= j for a lower B, k below the
// last touched row + 1 for a lower A, k below the end of the column panel for an upper B.
//
// NOTE(review): braces, the declarations of the running indices `j` and `i` and the
// guards around the single-row tails are not visible in this excerpt.
2262 template<
typename MT3
2265 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2266 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2268 const size_t M( A.rows() );
2269 const size_t N( B.columns() );
2270 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
2272 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// --- Column blocks ---
2274 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
2276 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// End of the vectorizable part of this column block; the assertion below checks that
// the mask yields jend rounded down to a multiple of SIMDSIZE.
2278 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2279 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// --- Row blocks ---
2281 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
2283 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// --- Reduction (k) blocks; ktmp is the exclusive end of the current k block ---
2285 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
2287 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// --- Panels of 4 SIMD vectors ---
2291 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2293 const size_t j1( j+SIMDSIZE );
2294 const size_t j2( j+SIMDSIZE*2UL );
2295 const size_t j3( j+SIMDSIZE*3UL );
// Two rows of C at a time.
2299 for( ; (i+2UL) <= iend; i+=2UL )
2301 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2302 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2303 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
2304 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
2306 SIMDType xmm1( (~C).load(i ,j ) );
2307 SIMDType xmm2( (~C).load(i ,j1) );
2308 SIMDType xmm3( (~C).load(i ,j2) );
2309 SIMDType xmm4( (~C).load(i ,j3) );
2310 SIMDType xmm5( (~C).load(i+1UL,j ) );
2311 SIMDType xmm6( (~C).load(i+1UL,j1) );
2312 SIMDType xmm7( (~C).load(i+1UL,j2) );
2313 SIMDType xmm8( (~C).load(i+1UL,j3) );
2315 for(
size_t k=kbegin; k<kend; ++k ) {
// Each loaded vector of B is reused for both rows of A.
2316 const SIMDType a1(
set( A(i ,k) ) );
2317 const SIMDType a2(
set( A(i+1UL,k) ) );
2318 const SIMDType b1( B.load(k,j ) );
2319 const SIMDType b2( B.load(k,j1) );
2320 const SIMDType b3( B.load(k,j2) );
2321 const SIMDType b4( B.load(k,j3) );
2322 xmm1 = xmm1 + a1 * b1;
2323 xmm2 = xmm2 + a1 * b2;
2324 xmm3 = xmm3 + a1 * b3;
2325 xmm4 = xmm4 + a1 * b4;
2326 xmm5 = xmm5 + a2 * b1;
2327 xmm6 = xmm6 + a2 * b2;
2328 xmm7 = xmm7 + a2 * b3;
2329 xmm8 = xmm8 + a2 * b4;
2332 (~C).store( i , j , xmm1 );
2333 (~C).store( i , j1, xmm2 );
2334 (~C).store( i , j2, xmm3 );
2335 (~C).store( i , j3, xmm4 );
2336 (~C).store( i+1UL, j , xmm5 );
2337 (~C).store( i+1UL, j1, xmm6 );
2338 (~C).store( i+1UL, j2, xmm7 );
2339 (~C).store( i+1UL, j3, xmm8 );
// Single-row tail of the 4-vector panel (enclosing guard not visible in this excerpt).
2344 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2345 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2346 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2347 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
2349 SIMDType xmm1( (~C).load(i,j ) );
2350 SIMDType xmm2( (~C).load(i,j1) );
2351 SIMDType xmm3( (~C).load(i,j2) );
2352 SIMDType xmm4( (~C).load(i,j3) );
2354 for(
size_t k=kbegin; k<kend; ++k ) {
2355 const SIMDType a1(
set( A(i,k) ) );
2356 xmm1 = xmm1 + a1 * B.load(k,j );
2357 xmm2 = xmm2 + a1 * B.load(k,j1);
2358 xmm3 = xmm3 + a1 * B.load(k,j2);
2359 xmm4 = xmm4 + a1 * B.load(k,j3);
2362 (~C).store( i, j , xmm1 );
2363 (~C).store( i, j1, xmm2 );
2364 (~C).store( i, j2, xmm3 );
2365 (~C).store( i, j3, xmm4 );
// --- Panels of 2 SIMD vectors ---
2369 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2371 const size_t j1( j+SIMDSIZE );
// Four rows of C at a time.
2375 for( ; (i+4UL) <= iend; i+=4UL )
2377 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2378 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2379 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
2380 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
2382 SIMDType xmm1( (~C).load(i ,j ) );
2383 SIMDType xmm2( (~C).load(i ,j1) );
2384 SIMDType xmm3( (~C).load(i+1UL,j ) );
2385 SIMDType xmm4( (~C).load(i+1UL,j1) );
2386 SIMDType xmm5( (~C).load(i+2UL,j ) );
2387 SIMDType xmm6( (~C).load(i+2UL,j1) );
2388 SIMDType xmm7( (~C).load(i+3UL,j ) );
2389 SIMDType xmm8( (~C).load(i+3UL,j1) );
2391 for(
size_t k=kbegin; k<kend; ++k ) {
2392 const SIMDType a1(
set( A(i ,k) ) );
2393 const SIMDType a2(
set( A(i+1UL,k) ) );
2394 const SIMDType a3(
set( A(i+2UL,k) ) );
2395 const SIMDType a4(
set( A(i+3UL,k) ) );
2396 const SIMDType b1( B.load(k,j ) );
2397 const SIMDType b2( B.load(k,j1) );
2398 xmm1 = xmm1 + a1 * b1;
2399 xmm2 = xmm2 + a1 * b2;
2400 xmm3 = xmm3 + a2 * b1;
2401 xmm4 = xmm4 + a2 * b2;
2402 xmm5 = xmm5 + a3 * b1;
2403 xmm6 = xmm6 + a3 * b2;
2404 xmm7 = xmm7 + a4 * b1;
2405 xmm8 = xmm8 + a4 * b2;
2408 (~C).store( i , j , xmm1 );
2409 (~C).store( i , j1, xmm2 );
2410 (~C).store( i+1UL, j , xmm3 );
2411 (~C).store( i+1UL, j1, xmm4 );
2412 (~C).store( i+2UL, j , xmm5 );
2413 (~C).store( i+2UL, j1, xmm6 );
2414 (~C).store( i+3UL, j , xmm7 );
2415 (~C).store( i+3UL, j1, xmm8 );
// Two rows of C at a time.
2418 for( ; (i+2UL) <= iend; i+=2UL )
2420 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2421 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2422 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
2423 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
2425 SIMDType xmm1( (~C).load(i ,j ) );
2426 SIMDType xmm2( (~C).load(i ,j1) );
2427 SIMDType xmm3( (~C).load(i+1UL,j ) );
2428 SIMDType xmm4( (~C).load(i+1UL,j1) );
2430 for(
size_t k=kbegin; k<kend; ++k ) {
2431 const SIMDType a1(
set( A(i ,k) ) );
2432 const SIMDType a2(
set( A(i+1UL,k) ) );
2433 const SIMDType b1( B.load(k,j ) );
2434 const SIMDType b2( B.load(k,j1) );
2435 xmm1 = xmm1 + a1 * b1;
2436 xmm2 = xmm2 + a1 * b2;
2437 xmm3 = xmm3 + a2 * b1;
2438 xmm4 = xmm4 + a2 * b2;
2441 (~C).store( i , j , xmm1 );
2442 (~C).store( i , j1, xmm2 );
2443 (~C).store( i+1UL, j , xmm3 );
2444 (~C).store( i+1UL, j1, xmm4 );
// Single-row tail of the 2-vector panel (enclosing guard not visible in this excerpt).
2449 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2450 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2451 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2452 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
2454 SIMDType xmm1( (~C).load(i,j ) );
2455 SIMDType xmm2( (~C).load(i,j1) );
2457 for(
size_t k=kbegin; k<kend; ++k ) {
2458 const SIMDType a1(
set( A(i,k) ) );
2459 xmm1 = xmm1 + a1 * B.load(k,j );
2460 xmm2 = xmm2 + a1 * B.load(k,j1);
2463 (~C).store( i, j , xmm1 );
2464 (~C).store( i, j1, xmm2 );
// --- Panels of 1 SIMD vector, one row at a time ---
2468 for( ; j<jpos; j+=SIMDSIZE )
2470 for(
size_t i=ii; i<iend; ++i )
2472 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2473 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2474 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2475 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
2477 SIMDType xmm1( (~C).load(i,j) );
2479 for(
size_t k=kbegin; k<kend; ++k ) {
2480 const SIMDType a1(
set( A(i,k) ) );
2481 xmm1 = xmm1 + a1 * B.load(k,j);
2484 (~C).store( i, j, xmm1 );
// --- Scalar remainder: columns in [jpos, jend), only when `remainder` is set ---
2488 for( ; remainder && j<jend; ++j )
2490 for(
size_t i=ii; i<iend; ++i )
2492 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2493 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2494 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2495 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// Scalar accumulator seeded from C.
2497 ElementType value( (~C)(i,j) );
2499 for(
size_t k=kbegin; k<kend; ++k ) {
2500 value += A(i,k) * B(k,j);
// Large-matrix add-assignment kernel, vectorized overload for column-major targets
// (DenseMatrix<MT3,true>). No dedicated large kernel is used for this case: the call
// is forwarded to selectSmallAddAssignKernel with the unwrapped target (~C).
2527 template<
typename MT3
2530 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2531 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2533 selectSmallAddAssignKernel( ~C, A, B );
// BLAS add-assignment kernel, fallback overload: active when UseBlasKernel<MT3,MT4,MT5>
// is false (DisableIf_). It forwards to the large-matrix add-assignment kernel.
2552 template<
typename MT3
2555 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
2556 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2558 selectLargeAddAssignKernel( C, A, B );
2564 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS add-assignment kernel, C += A * B, active when UseBlasKernel<MT3,MT4,MT5> is true.
// (This overload sits behind the BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// preprocessor guard.)
//   - If A is triangular: B is copied into a temporary, the temporary is multiplied in place
//     from the left by A via trmm (CblasLeft, lower/upper chosen from IsLower<MT4>), and the
//     result is add-assigned to C.
//   - Else if B is triangular: the same with A copied and B applied from the right (CblasRight).
//   - Otherwise a single gemm call with both scalar factors equal to ET(1) is issued.
// NOTE(review): braces and the final `else` line are not visible in this excerpt.
2578 template<
typename MT3
2581 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
2582 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Element type of the target; used to spell the scalar factors passed to trmm/gemm.
2584 using ET = ElementType_<MT3>;
2586 if( IsTriangular<MT4>::value ) {
2587 ResultType_<MT3> tmp(
serial( B ) );
2588 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2589 addAssign( C, tmp );
2591 else if( IsTriangular<MT5>::value ) {
2592 ResultType_<MT3> tmp(
serial( A ) );
2593 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2594 addAssign( C, tmp );
2597 gemm( C, A, B, ET(1), ET(1) );
// Symmetry-exploiting friend overload, enabled when CanExploitSymmetry<MT,MT1,MT2> holds
// (column-major target and at least one symmetric operand). The symmetric operand(s) are
// wrapped in trans(...) and the add-assignment is restarted with the restructured product:
//   - both operands symmetric: trans(lhs_) * trans(rhs_)
//   - only the left operand symmetric: trans(lhs_) * rhs_
//   - otherwise: lhs_ * trans(rhs_)
// NOTE(review): the function name/parameter line, braces and the final `else` are not
// visible in this excerpt; presumably this is the addAssign overload - confirm.
2619 template<
typename MT >
2620 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
2630 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2631 addAssign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
2632 else if( IsSymmetric<MT1>::value )
2633 addAssign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
2635 addAssign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Friend overload for the case where symmetry cannot be exploited (DisableIf_ on
// CanExploitSymmetry<MT,MT1,MT2>). It checks for an empty target or an empty inner
// dimension, materializes both operands through serial(...) into A (type LT) and
// B (type RT), and dispatches to selectSubAssignKernel.
// NOTE(review): the function name/parameter line, the body of the emptiness check and
// any assertions between the visible lines are not shown in this excerpt; presumably this
// is the subAssign overload and the check returns early - confirm.
2657 template<
typename MT
2659 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
// Nothing to do if the target has no rows/columns or the inner dimension is zero.
2667 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2671 LT A(
serial( rhs.lhs_ ) );
2672 RT B(
serial( rhs.rhs_ ) );
2681 DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Entry point for choosing a subtraction-assignment kernel, C -= A * B.
// The small kernel is used when B is diagonal or when the number of elements of C
// (rows * columns) is below DMATDMATMULT_THRESHOLD; otherwise the BLAS kernel selector
// is called.
// NOTE(review): the `else` line between the two calls is not visible in this excerpt.
2697 template<
typename MT3
2700 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2702 if( ( IsDiagonal<MT5>::value ) ||
2703 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
2704 selectSmallSubAssignKernel( C, A, B );
2706 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction-assignment kernel, C -= A * B, for the general case where
// neither A (MT4) nor B (MT5) is diagonal (see the EnableIf_ condition).
// Loop order is i (rows of A), k (reduction index), j (columns of B); the innermost j loop
// is unrolled by two. The k range is narrowed by the triangular structure of A, the j range
// by the triangular structure of B.
// NOTE(review): braces, the non-triangular branches of the range expressions and the guard
// around the final tail statement are not visible in this excerpt.
2725 template<
typename MT3
2728 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2729 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2731 const size_t M( A.rows() );
2732 const size_t N( B.columns() );
2733 const size_t K( A.columns() );
2735 for(
size_t i=0UL; i<M; ++i )
// Range of k for row i of A: starts at i (i+1 if strictly upper) for an upper A ...
2737 const size_t kbegin( ( IsUpper<MT4>::value )
2738 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// ... and ends at i+1 (i if strictly lower) for a lower A.
2740 const size_t kend( ( IsLower<MT4>::value )
2741 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2745 for(
size_t k=kbegin; k<kend; ++k )
// Range of j for row k of B, derived the same way from the structure of B.
2747 const size_t jbegin( ( IsUpper<MT5>::value )
2748 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2750 const size_t jend( ( IsLower<MT5>::value )
2751 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos: end of the part of [jbegin, jend) that can be processed in pairs
// (jnum with its lowest bit cleared).
2755 const size_t jnum( jend - jbegin );
2756 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2758 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2759 C(i,j ) -= A(i,k) * B(k,j );
2760 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Tail: the single leftover column at index jpos.
2763 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default (scalar) subtraction-assignment kernel, C -= A * B, for the case where A (MT4)
// is not diagonal and B (MT5) is diagonal (see the EnableIf_ condition).
// Only the diagonal element B(j,j) contributes to column j, so C(i,j) -= A(i,j) * B(j,j)
// for every j in [jbegin, jend); the range is narrowed by the triangular structure of A.
// NOTE(review): braces, the non-triangular branches of jbegin/jend and the guard around
// the final tail statement are not visible in this excerpt.
2785 template<
typename MT3
2788 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2789 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2793 const size_t M( A.rows() );
2794 const size_t N( B.columns() );
2796 for(
size_t i=0UL; i<M; ++i )
2798 const size_t jbegin( ( IsUpper<MT4>::value )
2799 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2801 const size_t jend( ( IsLower<MT4>::value )
2802 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos: end of the part of the range that can be processed in pairs.
2806 const size_t jnum( jend - jbegin );
2807 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2809 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2810 C(i,j ) -= A(i,j ) * B(j ,j );
2811 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Tail: the single leftover column at index jpos.
2814 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default (scalar) subtraction-assignment kernel, C -= A * B, for the case where A (MT4)
// is diagonal and B (MT5) is not (see the EnableIf_ condition).
// Only the diagonal element A(i,i) contributes to row i, so C(i,j) -= A(i,i) * B(i,j)
// for every j in [jbegin, jend); the range is narrowed by the triangular structure of B.
// NOTE(review): braces, the non-triangular branches of jbegin/jend and the guard around
// the final tail statement are not visible in this excerpt.
2835 template<
typename MT3
2838 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2839 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2843 const size_t M( A.rows() );
2844 const size_t N( B.columns() );
2846 for(
size_t i=0UL; i<M; ++i )
2848 const size_t jbegin( ( IsUpper<MT5>::value )
2849 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2851 const size_t jend( ( IsLower<MT5>::value )
2852 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos: end of the part of the range that can be processed in pairs.
2856 const size_t jnum( jend - jbegin );
2857 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2859 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2860 C(i,j ) -= A(i,i) * B(i,j );
2861 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Tail: the single leftover column at index jpos.
2864 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel, C -= A * B, for the case where both A (MT4) and
// B (MT5) are diagonal (see the EnableIf_ condition). Only the diagonal of C is updated:
// C(i,i) -= A(i,i) * B(i,i) for i in [0, A.rows()).
2885 template<
typename MT3
2888 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2889 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2893 for(
size_t i=0UL; i<A.rows(); ++i ) {
2894 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel, non-vectorized overload: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false (DisableIf_). It simply forwards
// to the default (scalar) subtraction-assignment kernel.
2914 template<
typename MT3
2917 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2918 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2920 selectDefaultSubAssignKernel( C, A, B );
// Small-matrix subtraction-assignment kernel, vectorized overload for row-major targets
// (DenseMatrix<MT3,false>): C -= A * B. Active when UseVectorizedDefaultKernel is true.
//
// The columns of C are processed in panels of decreasing width (8, 4, 2, 1 SIMD vectors),
// followed by a scalar loop for the columns that do not fill a whole SIMD vector. Within
// a panel, each accumulator register is loaded from C, reduced by set(A(i,k)) * B.load(k,j)
// for every k in [kbegin, kend), and stored back to C.
//
// kbegin/kend narrow the k range according to the compile-time structure of the operands:
// an upper A starts at i (i+1 if strictly upper), a lower B starts at j, a lower A ends at
// the last touched row + 1 (one less if strictly lower), an upper B ends at the end of the
// current column panel (clamped to K).
//
// NOTE(review): braces, the declarations of the running indices `j` and `i`, the
// non-triangular branches of some kend expressions and the guards around the single-row
// tails are not visible in this excerpt.
2940 template<
typename MT3
2943 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2944 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2946 const size_t M( A.rows() );
2947 const size_t N( B.columns() );
2948 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
2950 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the vectorizable column range: N masked with size_t(-SIMDSIZE) when a remainder
// is required (rounds N down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two).
2952 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// --- Panels of 8 SIMD vectors, one row of C at a time ---
2957 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2958 for(
size_t i=0UL; i<M; ++i )
2960 const size_t kbegin( ( IsUpper<MT4>::value )
2961 ?( ( IsLower<MT5>::value )
2962 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2963 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2964 :( IsLower<MT5>::value ? j : 0UL ) );
2965 const size_t kend( ( IsLower<MT4>::value )
2966 ?( ( IsUpper<MT5>::value )
2967 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2968 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2969 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
// Accumulators start from the current contents of C (subtraction-assignment).
2971 SIMDType xmm1( (~C).load(i,j ) );
2972 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2973 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2974 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2975 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2976 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2977 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2978 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2980 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast A(i,k) and multiply it with eight consecutive vectors of row k of B.
2981 const SIMDType a1(
set( A(i,k) ) );
2982 xmm1 = xmm1 - a1 * B.load(k,j );
2983 xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE );
2984 xmm3 = xmm3 - a1 * B.load(k,j+SIMDSIZE*2UL);
2985 xmm4 = xmm4 - a1 * B.load(k,j+SIMDSIZE*3UL);
2986 xmm5 = xmm5 - a1 * B.load(k,j+SIMDSIZE*4UL);
2987 xmm6 = xmm6 - a1 * B.load(k,j+SIMDSIZE*5UL);
2988 xmm7 = xmm7 - a1 * B.load(k,j+SIMDSIZE*6UL);
2989 xmm8 = xmm8 - a1 * B.load(k,j+SIMDSIZE*7UL);
2992 (~C).store( i, j , xmm1 );
2993 (~C).store( i, j+SIMDSIZE , xmm2 );
2994 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2995 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2996 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2997 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2998 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2999 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// --- Panels of 4 SIMD vectors, two rows of C at a time ---
3003 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3007 for( ; (i+2UL) <= M; i+=2UL )
3009 const size_t kbegin( ( IsUpper<MT4>::value )
3010 ?( ( IsLower<MT5>::value )
3011 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3012 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3013 :( IsLower<MT5>::value ? j : 0UL ) );
// Two rows (i and i+1) are handled, hence the lower-A bound of i+2 (i+1 if strictly lower).
3014 const size_t kend( ( IsLower<MT4>::value )
3015 ?( ( IsUpper<MT5>::value )
3016 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3017 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3018 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
3020 SIMDType xmm1( (~C).load(i ,j ) );
3021 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3022 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3023 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3024 SIMDType xmm5( (~C).load(i+1UL,j ) );
3025 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3026 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3027 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3029 for(
size_t k=kbegin; k<kend; ++k ) {
// Each loaded vector of B is reused for both rows of A.
3030 const SIMDType a1(
set( A(i ,k) ) );
3031 const SIMDType a2(
set( A(i+1UL,k) ) );
3032 const SIMDType b1( B.load(k,j ) );
3033 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3034 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3035 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3036 xmm1 = xmm1 - a1 * b1;
3037 xmm2 = xmm2 - a1 * b2;
3038 xmm3 = xmm3 - a1 * b3;
3039 xmm4 = xmm4 - a1 * b4;
3040 xmm5 = xmm5 - a2 * b1;
3041 xmm6 = xmm6 - a2 * b2;
3042 xmm7 = xmm7 - a2 * b3;
3043 xmm8 = xmm8 - a2 * b4;
3046 (~C).store( i , j , xmm1 );
3047 (~C).store( i , j+SIMDSIZE , xmm2 );
3048 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3049 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3050 (~C).store( i+1UL, j , xmm5 );
3051 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3052 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3053 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row tail of the 4-vector panel (enclosing guard not visible in this excerpt).
3058 const size_t kbegin( ( IsUpper<MT4>::value )
3059 ?( ( IsLower<MT5>::value )
3060 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3061 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3062 :( IsLower<MT5>::value ? j : 0UL ) );
3063 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
3065 SIMDType xmm1( (~C).load(i,j ) );
3066 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3067 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3068 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3070 for(
size_t k=kbegin; k<kend; ++k ) {
3071 const SIMDType a1(
set( A(i,k) ) );
3072 xmm1 = xmm1 - a1 * B.load(k,j );
3073 xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE );
3074 xmm3 = xmm3 - a1 * B.load(k,j+SIMDSIZE*2UL);
3075 xmm4 = xmm4 - a1 * B.load(k,j+SIMDSIZE*3UL);
3078 (~C).store( i, j , xmm1 );
3079 (~C).store( i, j+SIMDSIZE , xmm2 );
3080 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3081 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// --- Panels of 2 SIMD vectors, two rows of C at a time ---
3085 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3089 for( ; (i+2UL) <= M; i+=2UL )
3091 const size_t kbegin( ( IsUpper<MT4>::value )
3092 ?( ( IsLower<MT5>::value )
3093 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3094 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3095 :( IsLower<MT5>::value ? j : 0UL ) );
3096 const size_t kend( ( IsLower<MT4>::value )
3097 ?( ( IsUpper<MT5>::value )
3098 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3099 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3100 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
3102 SIMDType xmm1( (~C).load(i ,j ) );
3103 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3104 SIMDType xmm3( (~C).load(i+1UL,j ) );
3105 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3107 for(
size_t k=kbegin; k<kend; ++k ) {
3108 const SIMDType a1(
set( A(i ,k) ) );
3109 const SIMDType a2(
set( A(i+1UL,k) ) );
3110 const SIMDType b1( B.load(k,j ) );
3111 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3112 xmm1 = xmm1 - a1 * b1;
3113 xmm2 = xmm2 - a1 * b2;
3114 xmm3 = xmm3 - a2 * b1;
3115 xmm4 = xmm4 - a2 * b2;
3118 (~C).store( i , j , xmm1 );
3119 (~C).store( i , j+SIMDSIZE, xmm2 );
3120 (~C).store( i+1UL, j , xmm3 );
3121 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// Single-row tail of the 2-vector panel (enclosing guard not visible in this excerpt).
3126 const size_t kbegin( ( IsUpper<MT4>::value )
3127 ?( ( IsLower<MT5>::value )
3128 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3129 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3130 :( IsLower<MT5>::value ? j : 0UL ) );
3131 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
3133 SIMDType xmm1( (~C).load(i,j ) );
3134 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3136 for(
size_t k=kbegin; k<kend; ++k ) {
3137 const SIMDType a1(
set( A(i,k) ) );
3138 xmm1 = xmm1 - a1 * B.load(k,j );
3139 xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE);
3142 (~C).store( i, j , xmm1 );
3143 (~C).store( i, j+SIMDSIZE, xmm2 );
// --- Panels of 1 SIMD vector, two rows of C at a time ---
3147 for( ; j<jpos; j+=SIMDSIZE )
3151 for( ; (i+2UL) <= M; i+=2UL )
3153 const size_t kbegin( ( IsUpper<MT4>::value )
3154 ?( ( IsLower<MT5>::value )
3155 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3156 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3157 :( IsLower<MT5>::value ? j : 0UL ) );
3158 const size_t kend( ( IsLower<MT4>::value )
3159 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3162 SIMDType xmm1( (~C).load(i ,j) );
3163 SIMDType xmm2( (~C).load(i+1UL,j) );
3165 for(
size_t k=kbegin; k<kend; ++k ) {
3166 const SIMDType b1( B.load(k,j) );
3167 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
3168 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
3171 (~C).store( i , j, xmm1 );
3172 (~C).store( i+1UL, j, xmm2 );
// Single-row tail of the 1-vector panel; the k loop runs up to K here.
3177 const size_t kbegin( ( IsUpper<MT4>::value )
3178 ?( ( IsLower<MT5>::value )
3179 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3180 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3181 :( IsLower<MT5>::value ? j : 0UL ) );
3183 SIMDType xmm1( (~C).load(i,j) );
3185 for(
size_t k=kbegin; k<K; ++k ) {
3186 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
3189 (~C).store( i, j, xmm1 );
// --- Scalar remainder: columns in [jpos, N), only when `remainder` is set ---
3193 for( ; remainder && j<N; ++j )
3197 for( ; (i+2UL) <= M; i+=2UL )
3199 const size_t kbegin( ( IsUpper<MT4>::value )
3200 ?( ( IsLower<MT5>::value )
3201 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3202 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3203 :( IsLower<MT5>::value ? j : 0UL ) );
3204 const size_t kend( ( IsLower<MT4>::value )
3205 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
// Scalar accumulators for two rows, seeded from C.
3208 ElementType value1( (~C)(i ,j) );
3209 ElementType value2( (~C)(i+1UL,j) );
3211 for(
size_t k=kbegin; k<kend; ++k ) {
3212 value1 -= A(i ,k) * B(k,j);
3213 value2 -= A(i+1UL,k) * B(k,j);
3216 (~C)(i ,j) = value1;
3217 (~C)(i+1UL,j) = value2;
// Single-row tail of the scalar remainder.
3222 const size_t kbegin( ( IsUpper<MT4>::value )
3223 ?( ( IsLower<MT5>::value )
3224 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3225 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3226 :( IsLower<MT5>::value ? j : 0UL ) );
3228 ElementType value( (~C)(i,j) );
3230 for(
size_t k=kbegin; k<K; ++k ) {
3231 value -= A(i,k) * B(k,j);
// Small-matrix subtraction-assignment kernel, vectorized overload for column-major targets
// (DenseMatrix<MT3,true>). Instead of computing the product directly, one operand is
// copied into a temporary of the opposite storage order (OppositeType_) and the
// subtraction-assignment is restarted with that temporary:
//   - A is converted if A is not resizable while B is;
//   - B is converted if A is resizable while B is not;
//   - otherwise the operand with fewer elements (rows*columns) is converted, A on a tie.
// The temporaries are built from serial(...) of the operand.
// NOTE(review): braces and the final `else` line are not visible in this excerpt.
3256 template<
typename MT3
3259 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3260 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3267 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3268 const OppositeType_<MT4> tmp(
serial( A ) );
3269 subAssign( ~C, tmp * B );
3271 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3272 const OppositeType_<MT5> tmp(
serial( B ) );
3273 subAssign( ~C, A * tmp );
// Both or neither resizable: convert the smaller operand.
3275 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3276 const OppositeType_<MT4> tmp(
serial( A ) );
3277 subAssign( ~C, tmp * B );
3280 const OppositeType_<MT5> tmp(
serial( B ) );
3281 subAssign( ~C, A * tmp );
// Large-matrix subtraction-assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold. It simply forwards to
// the default (non-vectorized) subtraction kernel.
3301 template<
typename MT3
3304 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3305 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3307 selectDefaultSubAssignKernel( C, A, B );
// Vectorized, blocked subtraction-assignment kernel (C -= A * B) for the
// overload taking the target as DenseMatrix<MT3,false>; enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The iteration space is tiled by DMATDMATMULT_JBLOCK_SIZE (columns of C),
// DMATDMATMULT_IBLOCK_SIZE (rows of C) and DMATDMATMULT_KBLOCK_SIZE (inner
// dimension). Inside a tile, column panels of 4, 2 and 1 SIMD vectors are
// processed, followed by a scalar loop for leftover columns.
// Each micro-kernel loads the current values of C, subtracts the partial
// products for the current k-range and stores the result back.
// NOTE(review): several lines (braces, declarations of `j`/`i`, tail-row
// conditions) are not visible in this excerpt.
3327 template<
typename MT3
3330 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3331 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Problem dimensions: C is M x N, inner dimension K.
3333 const size_t M( A.rows() );
3334 const size_t N( B.columns() );
3335 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
3337 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Outer tiling over the columns of C.
3339 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
3341 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// End of the SIMD-processable column range: jend rounded down to a multiple of
// SIMDSIZE when a remainder loop is required (checked by the assertion below).
3343 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3344 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Tiling over the rows of C.
3346 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
3348 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Tiling over the inner dimension; ktmp is the end of the current k-tile.
3350 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
3352 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// Column panel of 4 SIMD vectors.
3356 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3358 const size_t j1( j+SIMDSIZE );
3359 const size_t j2( j+SIMDSIZE*2UL );
3360 const size_t j3( j+SIMDSIZE*3UL );
// Two rows at a time: 2 x 4 accumulator registers.
3364 for( ; (i+2UL) <= iend; i+=2UL )
// The k-range is clipped to the current k-tile and, for operands flagged as
// upper/lower, to the index range implied by i and j.
3366 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3367 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3368 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3369 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
3371 SIMDType xmm1( (~C).load(i ,j ) );
3372 SIMDType xmm2( (~C).load(i ,j1) );
3373 SIMDType xmm3( (~C).load(i ,j2) );
3374 SIMDType xmm4( (~C).load(i ,j3) );
3375 SIMDType xmm5( (~C).load(i+1UL,j ) );
3376 SIMDType xmm6( (~C).load(i+1UL,j1) );
3377 SIMDType xmm7( (~C).load(i+1UL,j2) );
3378 SIMDType xmm8( (~C).load(i+1UL,j3) );
3380 for(
size_t k=kbegin; k<kend; ++k ) {
3381 const SIMDType a1(
set( A(i ,k) ) );
3382 const SIMDType a2(
set( A(i+1UL,k) ) );
3383 const SIMDType b1( B.load(k,j ) );
3384 const SIMDType b2( B.load(k,j1) );
3385 const SIMDType b3( B.load(k,j2) );
3386 const SIMDType b4( B.load(k,j3) );
3387 xmm1 = xmm1 - a1 * b1;
3388 xmm2 = xmm2 - a1 * b2;
3389 xmm3 = xmm3 - a1 * b3;
3390 xmm4 = xmm4 - a1 * b4;
3391 xmm5 = xmm5 - a2 * b1;
3392 xmm6 = xmm6 - a2 * b2;
3393 xmm7 = xmm7 - a2 * b3;
3394 xmm8 = xmm8 - a2 * b4;
3397 (~C).store( i , j , xmm1 );
3398 (~C).store( i , j1, xmm2 );
3399 (~C).store( i , j2, xmm3 );
3400 (~C).store( i , j3, xmm4 );
3401 (~C).store( i+1UL, j , xmm5 );
3402 (~C).store( i+1UL, j1, xmm6 );
3403 (~C).store( i+1UL, j2, xmm7 );
3404 (~C).store( i+1UL, j3, xmm8 );
// Single-row variant for the same 4-vector panel (only row i is touched).
3409 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3410 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3411 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3412 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
3414 SIMDType xmm1( (~C).load(i,j ) );
3415 SIMDType xmm2( (~C).load(i,j1) );
3416 SIMDType xmm3( (~C).load(i,j2) );
3417 SIMDType xmm4( (~C).load(i,j3) );
3419 for(
size_t k=kbegin; k<kend; ++k ) {
3420 const SIMDType a1(
set( A(i,k) ) );
3421 xmm1 = xmm1 - a1 * B.load(k,j );
3422 xmm2 = xmm2 - a1 * B.load(k,j1);
3423 xmm3 = xmm3 - a1 * B.load(k,j2);
3424 xmm4 = xmm4 - a1 * B.load(k,j3);
3427 (~C).store( i, j , xmm1 );
3428 (~C).store( i, j1, xmm2 );
3429 (~C).store( i, j2, xmm3 );
3430 (~C).store( i, j3, xmm4 );
// Column panel of 2 SIMD vectors.
3434 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3436 const size_t j1( j+SIMDSIZE );
// Four rows at a time: 4 x 2 accumulator registers.
3440 for( ; (i+4UL) <= iend; i+=4UL )
3442 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3443 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3444 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3445 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3447 SIMDType xmm1( (~C).load(i ,j ) );
3448 SIMDType xmm2( (~C).load(i ,j1) );
3449 SIMDType xmm3( (~C).load(i+1UL,j ) );
3450 SIMDType xmm4( (~C).load(i+1UL,j1) );
3451 SIMDType xmm5( (~C).load(i+2UL,j ) );
3452 SIMDType xmm6( (~C).load(i+2UL,j1) );
3453 SIMDType xmm7( (~C).load(i+3UL,j ) );
3454 SIMDType xmm8( (~C).load(i+3UL,j1) );
3456 for(
size_t k=kbegin; k<kend; ++k ) {
3457 const SIMDType a1(
set( A(i ,k) ) );
3458 const SIMDType a2(
set( A(i+1UL,k) ) );
3459 const SIMDType a3(
set( A(i+2UL,k) ) );
3460 const SIMDType a4(
set( A(i+3UL,k) ) );
3461 const SIMDType b1( B.load(k,j ) );
3462 const SIMDType b2( B.load(k,j1) );
3463 xmm1 = xmm1 - a1 * b1;
3464 xmm2 = xmm2 - a1 * b2;
3465 xmm3 = xmm3 - a2 * b1;
3466 xmm4 = xmm4 - a2 * b2;
3467 xmm5 = xmm5 - a3 * b1;
3468 xmm6 = xmm6 - a3 * b2;
3469 xmm7 = xmm7 - a4 * b1;
3470 xmm8 = xmm8 - a4 * b2;
3473 (~C).store( i , j , xmm1 );
3474 (~C).store( i , j1, xmm2 );
3475 (~C).store( i+1UL, j , xmm3 );
3476 (~C).store( i+1UL, j1, xmm4 );
3477 (~C).store( i+2UL, j , xmm5 );
3478 (~C).store( i+2UL, j1, xmm6 );
3479 (~C).store( i+3UL, j , xmm7 );
3480 (~C).store( i+3UL, j1, xmm8 );
// Two rows at a time for the rows left over by the 4-row loop.
3483 for( ; (i+2UL) <= iend; i+=2UL )
3485 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3486 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3487 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3488 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3490 SIMDType xmm1( (~C).load(i ,j ) );
3491 SIMDType xmm2( (~C).load(i ,j1) );
3492 SIMDType xmm3( (~C).load(i+1UL,j ) );
3493 SIMDType xmm4( (~C).load(i+1UL,j1) );
3495 for(
size_t k=kbegin; k<kend; ++k ) {
3496 const SIMDType a1(
set( A(i ,k) ) );
3497 const SIMDType a2(
set( A(i+1UL,k) ) );
3498 const SIMDType b1( B.load(k,j ) );
3499 const SIMDType b2( B.load(k,j1) );
3500 xmm1 = xmm1 - a1 * b1;
3501 xmm2 = xmm2 - a1 * b2;
3502 xmm3 = xmm3 - a2 * b1;
3503 xmm4 = xmm4 - a2 * b2;
3506 (~C).store( i , j , xmm1 );
3507 (~C).store( i , j1, xmm2 );
3508 (~C).store( i+1UL, j , xmm3 );
3509 (~C).store( i+1UL, j1, xmm4 );
// Single-row variant for the 2-vector panel.
3514 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3515 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3516 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3517 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3519 SIMDType xmm1( (~C).load(i,j ) );
3520 SIMDType xmm2( (~C).load(i,j1) );
3522 for(
size_t k=kbegin; k<kend; ++k ) {
3523 const SIMDType a1(
set( A(i,k) ) );
3524 xmm1 = xmm1 - a1 * B.load(k,j );
3525 xmm2 = xmm2 - a1 * B.load(k,j1);
3528 (~C).store( i, j , xmm1 );
3529 (~C).store( i, j1, xmm2 );
// Column panel of a single SIMD vector, one row at a time.
3533 for( ; j<jpos; j+=SIMDSIZE )
3535 for(
size_t i=ii; i<iend; ++i )
3537 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3538 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3539 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3540 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
3542 SIMDType xmm1( (~C).load(i,j) );
3544 for(
size_t k=kbegin; k<kend; ++k ) {
3545 const SIMDType a1(
set( A(i,k) ) );
3546 xmm1 = xmm1 - a1 * B.load(k,j);
3549 (~C).store( i, j, xmm1 );
// Scalar loop for the columns in [jpos, jend) that do not fill a SIMD vector.
3553 for( ; remainder && j<jend; ++j )
3555 for(
size_t i=ii; i<iend; ++i )
3557 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3558 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3559 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3560 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
3562 ElementType value( (~C)(i,j) );
3564 for(
size_t k=kbegin; k<kend; ++k ) {
3565 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel for the overload taking the target
// as DenseMatrix<MT3,true> (vectorized case). No dedicated blocked kernel is
// used here: the call is forwarded to selectSmallSubAssignKernel().
3592 template<
typename MT3
3595 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3596 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3598 selectSmallSubAssignKernel( ~C, A, B );
// BLAS-path subtraction-assignment kernel, fallback overload: selected when
// UseBlasKernel<MT3,MT4,MT5> does NOT hold. Forwards to the large-matrix kernel.
3617 template<
typename MT3
3620 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
3621 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3623 selectLargeSubAssignKernel( C, A, B );
3629 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction-assignment kernel (C -= A * B), selected when
// UseBlasKernel<MT3,MT4,MT5> holds.
// Triangular operands go through trmm() on a temporary that is subsequently
// subtracted from C; the general case is a single gemm() call.
3643 template<
typename MT3
3646 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
3647 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3649 typedef ElementType_<MT3> ET;
// A is triangular: copy B, multiply by A from the left (CblasLeft), subtract.
// NOTE(review): trmm presumably overwrites tmp with the product -- confirm.
3651 if( IsTriangular<MT4>::value ) {
3652 ResultType_<MT3> tmp(
serial( B ) );
3653 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3654 subAssign( C, tmp );
// B is triangular: copy A, multiply by B from the right (CblasRight), subtract.
3656 else if( IsTriangular<MT5>::value ) {
3657 ResultType_<MT3> tmp(
serial( A ) );
3658 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3659 subAssign( C, tmp );
// General case: scaling factors -1 and 1 are passed to gemm
// (presumably alpha = -1 for A*B and beta = 1 for C -- confirm against the wrapper).
3662 gemm( C, A, B, ET(-1), ET(1) );
// Friend overload enabled only when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): the line carrying the function name and parameter list is not
// visible in this excerpt.
// Symmetric operands are wrapped in trans() before the product is handed to
// subAssign(): both operands if both are symmetric, otherwise only the
// symmetric one.
3684 template<
typename MT >
3685 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3695 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3696 subAssign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
3697 else if( IsSymmetric<MT1>::value )
3698 subAssign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
3700 subAssign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Friend overload enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): function name, parameters and branch bodies are not visible in
// this excerpt. The visible conditions distinguish an empty target (zero rows
// or columns) from a zero inner dimension of the product.
3732 template<
typename MT
3734 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3742 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3745 else if( rhs.lhs_.columns() == 0UL ) {
// Friend overload enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): function name and parameters are not visible in this excerpt.
// The expression `rhs` is evaluated into a temporary whose type depends on the
// compile-time flag SO: OppositeType if SO is true, ResultType otherwise.
3780 template<
typename MT
3782 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3787 typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
3799 const TmpType tmp( rhs );
// Friend overload enabled only when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): function name, parameters and branch bodies are not visible in
// this excerpt; only the dispatch on which operand(s) are symmetric is shown.
3820 template<
typename MT >
3821 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3831 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3833 else if( IsSymmetric<MT1>::value )
// Friend overload enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): function name, parameters and body are not visible in this
// excerpt. The visible condition groups all degenerate sizes together: empty
// target (zero rows or columns) or zero inner dimension.
3857 template<
typename MT
3859 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3867 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend overload enabled only when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): function name, parameters and branch bodies are not visible in
// this excerpt; only the dispatch on which operand(s) are symmetric is shown.
3901 template<
typename MT >
3902 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3912 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3914 else if( IsSymmetric<MT1>::value )
// Friend overload enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): function name, parameters and body are not visible in this
// excerpt. The visible condition groups all degenerate sizes together: empty
// target (zero rows or columns) or zero inner dimension.
3942 template<
typename MT
3944 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3952 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend overload enabled only when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): function name, parameters and branch bodies are not visible in
// this excerpt; only the dispatch on which operand(s) are symmetric is shown.
3986 template<
typename MT >
3987 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3997 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3999 else if( IsSymmetric<MT1>::value )
4048 template<
typename MT1
4052 :
public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >, false >
4053 ,
private MatScalarMultExpr
4054 ,
private Computation
// Type of the wrapped matrix/matrix multiplication expression.
4058 typedef DMatDMatMultExpr<MT1,MT2> MMM;
// Result type of the multiplication expression.
4059 typedef ResultType_<MMM> RES;
// Result types of the left-hand and right-hand matrix operands.
4060 typedef ResultType_<MT1>
RT1;
4061 typedef ResultType_<MT2>
RT2;
// Element types of those result types.
4062 typedef ElementType_<RT1>
ET1;
4063 typedef ElementType_<RT2>
ET2;
// Composite types of the left-hand and right-hand matrix operands.
4064 typedef CompositeType_<MT1>
CT1;
4065 typedef CompositeType_<MT2>
CT2;
// Compile-time flag: true if the left operand MT1 is itself a computation or
// requires evaluation.
4070 enum :
bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Compile-time flag: same test for the right operand MT2.
4075 enum :
bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Helper trait: `value` is true when the target type T1 is a column-major
// matrix and at least one of the operand types T2, T3 is symmetric.
4085 template<
typename T1,
typename T2,
typename T3 >
4086 struct CanExploitSymmetry {
4087 enum :
bool { value = IsColumnMajorMatrix<T1>::value &&
4088 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
// Helper trait: `value` is true when at least one operand has to be evaluated
// (evaluateLeft or evaluateRight) and the symmetry shortcut described by
// CanExploitSymmetry<T1,T2,T3> is not available.
4097 template<
typename T1,
typename T2,
typename T3 >
4098 struct IsEvaluationRequired {
4099 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
4100 !CanExploitSymmetry<T1,T2,T3>::value };
// Helper trait deciding whether a BLAS kernel can be used for target type T1,
// operand types T2/T3 and scalar type T4.
// NOTE(review): the line that opens the enum (and any leading conditions on it)
// is not visible in this excerpt.
// Visible requirements: direct data access on all three matrices, no diagonal
// operand, SIMD enabled on all three, BLAS-compatible and identical element
// types, and NOT the combination "built-in element type with complex scalar".
4108 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4109 struct UseBlasKernel {
4111 HasMutableDataAccess<T1>::value &&
4112 HasConstDataAccess<T2>::value &&
4113 HasConstDataAccess<T3>::value &&
4114 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4115 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4116 IsBLASCompatible< ElementType_<T1> >::value &&
4117 IsBLASCompatible< ElementType_<T2> >::value &&
4118 IsBLASCompatible< ElementType_<T3> >::value &&
4119 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
4120 IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
4121 !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
// Helper trait deciding whether the vectorized default kernels can be used for
// target type T1, operand types T2/T3 and scalar type T4.
// NOTE(review): the enum opening line and part of the AreSIMDCombinable argument
// list are not visible in this excerpt.
// Visible requirements: T3 is not diagonal, SIMD is enabled on all three
// matrix types, and SIMD addition and multiplication exist for the operand
// element types.
4129 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4130 struct UseVectorizedDefaultKernel {
4132 !IsDiagonal<T3>::value &&
4133 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4134 AreSIMDCombinable< ElementType_<T1>
4138 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
4139 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// Type of this DMatScalarMultExpr instance.
4145 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// SIMD type associated with ElementType.
4150 typedef SIMDTrait_<ElementType>
SIMDType;
// Type of the left-hand operand of the scaling: the matrix multiplication expression.
4155 typedef const DMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Type used for the left matrix operand inside the kernels: an evaluated
// result (const RT1) if evaluateLeft is set, otherwise the composite type CT1.
4161 typedef IfTrue_< evaluateLeft, const RT1, CT1 >
LT;
// Same selection for the right matrix operand.
4164 typedef IfTrue_< evaluateRight, const RT2, CT2 >
RT;
// Compile-time flag for SIMD support of the scaled product: requires a
// non-diagonal right operand, SIMD-enabled operands, SIMD-combinable element
// and scalar types, and SIMD addition/multiplication for the element types.
4169 enum :
bool { simdEnabled = !IsDiagonal<MT2>::value &&
4170 MT1::simdEnabled && MT2::simdEnabled &&
4171 AreSIMDCombinable<ET1,ET2,ST>::value &&
4172 HasSIMDAdd<ET1,ET2>::value &&
4173 HasSIMDMult<ET1,ET2>::value };
// Compile-time flag for SMP assignment: set only if neither operand needs to be
// evaluated and both operand types are themselves SMP-assignable.
4176 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4177 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix multiplication expression and the scalar factor.
// NOTE(review): the initializer list and body are not visible in this excerpt;
// presumably they initialize matrix_ and scalar_.
4191 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// 2D element access: returns element (i,j) of the wrapped multiplication
// expression multiplied by the scalar. No bounds check is visible here.
4204 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4207 return matrix_(i,j) * scalar_;
// Checked element access: compares i against the row count and j against the
// column count before delegating to operator().
// NOTE(review): the bodies of the two range checks are not visible in this
// excerpt; presumably they raise an out-of-range error -- confirm.
4219 inline ReturnType
at(
size_t i,
size_t j )
const {
4220 if( i >= matrix_.rows() ) {
4223 if( j >= matrix_.columns() ) {
4226 return (*
this)(i,j);
// Returns the number of rows of the wrapped multiplication expression.
4235 inline size_t rows()
const {
4236 return matrix_.rows();
// Returns the number of columns of the wrapped multiplication expression.
4245 inline size_t columns()
const {
4246 return matrix_.columns();
// Reports whether the expression can alias the given address; the query is
// forwarded unchanged to the wrapped multiplication expression.
4276 template<
typename T >
4277 inline bool canAlias(
const T* alias )
const {
4278 return matrix_.canAlias( alias );
// Reports whether the expression is aliased with the given address; the query
// is forwarded unchanged to the wrapped multiplication expression.
4288 template<
typename T >
4289 inline bool isAliased(
const T* alias )
const {
4290 return matrix_.isAliased( alias );
// Forwards the alignment query to the wrapped multiplication expression.
// NOTE(review): the enclosing function signature is not visible in this excerpt.
4300 return matrix_.isAligned();
// NOTE(review): fragment of a return expression; its beginning and the
// enclosing function signature are not visible in this excerpt.
// Visible part: the element count rows()*columns() is compared against
// DMATDMATMULT_THRESHOLD (strictly below) and SMP_DMATDMATMULT_THRESHOLD
// (at or above).
4311 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
4312 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD );
// The wrapped matrix multiplication expression (left operand of the scaling).
4318 LeftOperand matrix_;
// The scalar factor (right operand of the scaling).
4319 RightOperand scalar_;
// Assignment of the scaled matrix product to a dense matrix; disabled when
// CanExploitSymmetry<MT,MT1,MT2> holds.
// The two operands of the wrapped product are extracted, degenerate sizes are
// special-cased, and the work is handed to selectAssignKernel() together with
// the scalar.
// NOTE(review): the definitions of `A` and `B` and the bodies of the size checks
// are not visible in this excerpt; presumably A/B are the (possibly evaluated)
// operands `left`/`right`.
4334 template<
typename MT
4336 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
4337 assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4344 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
4345 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Empty target.
4347 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Zero inner dimension.
4350 else if( left.columns() == 0UL ) {
4365 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for C = scalar * A * B.
// The small kernel is chosen if B is diagonal or if C has fewer than
// DMATDMATMULT_THRESHOLD elements; otherwise the BLAS path is tried.
// NOTE(review): the `else` line between the two calls is not visible in this excerpt.
4380 template<
typename MT3
4384 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4386 if( ( IsDiagonal<MT5>::value ) ||
4387 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
4388 selectSmallAssignKernel( C, A, B, scalar );
4390 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) assignment kernel for the case that neither A nor B
// is diagonal.
// C is built row by row: the first k-iteration initializes the row with
// A(i,kbegin)*B(kbegin,j), the remaining k-iterations accumulate into it. The
// k- and j-ranges are narrowed according to the upper/lower (strictly or not)
// traits of the operands.
// NOTE(review): many lines (the non-triangular range alternatives, reset calls,
// braces, and the statements of the final j-loop) are not visible in this
// excerpt. The use of `scalar` is not visible either; presumably the trailing
// j-loop applies it -- confirm.
4408 template<
typename MT3
4412 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4413 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4415 const size_t M( A.rows() );
4416 const size_t N( B.columns() );
4417 const size_t K( A.columns() );
4419 for(
size_t i=0UL; i<M; ++i )
// k-range of row i of A that can hold non-zero entries.
4421 const size_t kbegin( ( IsUpper<MT4>::value )
4422 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4424 const size_t kend( ( IsLower<MT4>::value )
4425 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Strictly triangular A with an empty k-range: the whole row is handled in a
// separate j-loop (its body is not visible here).
4429 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
4430 for(
size_t j=0UL; j<N; ++j ) {
// j-range for the first k-iteration (k == kbegin).
4437 const size_t jbegin( ( IsUpper<MT5>::value )
4438 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
4440 const size_t jend( ( IsLower<MT5>::value )
4441 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
4445 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4446 for(
size_t j=0UL; j<jbegin; ++j ) {
4450 else if( IsStrictlyUpper<MT5>::value ) {
// First k-iteration: plain assignment initializes C(i,j).
4453 for(
size_t j=jbegin; j<jend; ++j ) {
4454 C(i,j) = A(i,kbegin) * B(kbegin,j);
4456 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4457 for(
size_t j=jend; j<N; ++j ) {
4461 else if( IsStrictlyLower<MT5>::value ) {
4462 reset( C(i,N-1UL) );
// Remaining k-iterations accumulate into the row.
4466 for(
size_t k=kbegin+1UL; k<kend; ++k )
4468 const size_t jbegin( ( IsUpper<MT5>::value )
4469 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4471 const size_t jend( ( IsLower<MT5>::value )
4472 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
4476 for(
size_t j=jbegin; j<jend; ++j ) {
4477 C(i,j) += A(i,k) * B(k,j);
// For lower B the element at column jend is assigned rather than accumulated.
4479 if( IsLower<MT5>::value ) {
4480 C(i,jend) = A(i,k) * B(k,jend);
// j-range of row i of the result, used by the trailing loop.
4485 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4486 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
4488 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
4489 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
4493 for(
size_t j=jbegin; j<jend; ++j ) {
// Default assignment kernel for a non-diagonal A and a diagonal B.
// With a diagonal B each result element reduces to
// C(i,j) = A(i,j) * B(j,j) * scalar.
// The j-range is narrowed according to the upper/lower traits of A.
// NOTE(review): the non-triangular range alternatives and the bodies of the
// leading/trailing j-loops are not visible in this excerpt; presumably those
// loops reset the elements outside [jbegin, jend).
4515 template<
typename MT3
4519 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4520 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4524 const size_t M( A.rows() );
4525 const size_t N( B.columns() );
4527 for(
size_t i=0UL; i<M; ++i )
4529 const size_t jbegin( ( IsUpper<MT4>::value )
4530 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4532 const size_t jend( ( IsLower<MT4>::value )
4533 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4537 if( IsUpper<MT4>::value ) {
4538 for(
size_t j=0UL; j<jbegin; ++j ) {
4542 for(
size_t j=jbegin; j<jend; ++j ) {
4543 C(i,j) = A(i,j) * B(j,j) * scalar;
4545 if( IsLower<MT4>::value ) {
4546 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel for a diagonal A and a non-diagonal B.
// With a diagonal A each result element reduces to
// C(i,j) = A(i,i) * B(i,j) * scalar.
// The j-range is narrowed according to the upper/lower traits of B.
// NOTE(review): the non-triangular range alternatives and the bodies of the
// leading/trailing j-loops are not visible in this excerpt; presumably those
// loops reset the elements outside [jbegin, jend).
4568 template<
typename MT3
4572 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4573 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4577 const size_t M( A.rows() );
4578 const size_t N( B.columns() );
4580 for(
size_t i=0UL; i<M; ++i )
4582 const size_t jbegin( ( IsUpper<MT5>::value )
4583 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4585 const size_t jend( ( IsLower<MT5>::value )
4586 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4590 if( IsUpper<MT5>::value ) {
4591 for(
size_t j=0UL; j<jbegin; ++j ) {
4595 for(
size_t j=jbegin; j<jend; ++j ) {
4596 C(i,j) = A(i,i) * B(i,j) * scalar;
4598 if( IsLower<MT5>::value ) {
4599 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel for two diagonal operands: only the diagonal of C
// is written, with C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): lines preceding the loop are not visible in this excerpt;
// presumably C is reset before the diagonal is filled -- confirm.
4621 template<
typename MT3
4625 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4626 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4632 for(
size_t i=0UL; i<A.rows(); ++i ) {
4633 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold. Forwards to the
// default (non-vectorized) kernel.
4652 template<
typename MT3
4656 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4657 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4659 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix assignment kernel (C = scalar * A * B) for the
// overload taking the target as DenseMatrix<MT3,false>; enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Columns of C are processed in panels of 8, 4, 2 and 1 SIMD vectors, followed
// by a scalar loop for leftover columns. Each micro-kernel accumulates A*B in
// SIMD registers, multiplies the accumulators by the broadcast scalar
// (`factor`) and stores the result to C.
// NOTE(review): the accumulators are default-constructed; this relies on
// SIMDType's default constructor producing zero -- confirm.
// NOTE(review): several lines (declarations of `j`/`i`, tail-row conditions,
// braces, some kend alternatives, scalar accumulators) are not visible here.
4678 template<
typename MT3
4682 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4683 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, inner dimension K.
4685 const size_t M( A.rows() );
4686 const size_t N( B.columns() );
4687 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
4689 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the SIMD-processable column range: the low bits of N are masked off
// (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two).
4691 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Scalar broadcast into a SIMD register.
4694 const SIMDType factor(
set( scalar ) );
// Column panel of 8 SIMD vectors, one row at a time.
4698 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
4699 for(
size_t i=0UL; i<M; ++i )
// k-range narrowed according to the upper/lower traits of A and B.
4701 const size_t kbegin( ( IsUpper<MT4>::value )
4702 ?( ( IsLower<MT5>::value )
4703 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4704 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4705 :( IsLower<MT5>::value ? j : 0UL ) );
4706 const size_t kend( ( IsLower<MT4>::value )
4707 ?( ( IsUpper<MT5>::value )
4708 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
4709 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4710 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
4712 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4714 for(
size_t k=kbegin; k<kend; ++k ) {
4715 const SIMDType a1(
set( A(i,k) ) );
4716 xmm1 = xmm1 + a1 * B.load(k,j );
4717 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
4718 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
4719 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
4720 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
4721 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
4722 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
4723 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
4726 (~C).store( i, j , xmm1 * factor );
4727 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
4728 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
4729 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
4730 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
4731 (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
4732 (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
4733 (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
// Column panel of 4 SIMD vectors.
4737 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
// Two rows at a time: 2 x 4 accumulator registers.
4741 for( ; (i+2UL) <= M; i+=2UL )
4743 const size_t kbegin( ( IsUpper<MT4>::value )
4744 ?( ( IsLower<MT5>::value )
4745 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4746 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4747 :( IsLower<MT5>::value ? j : 0UL ) );
4748 const size_t kend( ( IsLower<MT4>::value )
4749 ?( ( IsUpper<MT5>::value )
4750 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
4751 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4752 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
4754 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4756 for(
size_t k=kbegin; k<kend; ++k ) {
4757 const SIMDType a1(
set( A(i ,k) ) );
4758 const SIMDType a2(
set( A(i+1UL,k) ) );
4759 const SIMDType b1( B.load(k,j ) );
4760 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4761 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4762 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4763 xmm1 = xmm1 + a1 * b1;
4764 xmm2 = xmm2 + a1 * b2;
4765 xmm3 = xmm3 + a1 * b3;
4766 xmm4 = xmm4 + a1 * b4;
4767 xmm5 = xmm5 + a2 * b1;
4768 xmm6 = xmm6 + a2 * b2;
4769 xmm7 = xmm7 + a2 * b3;
4770 xmm8 = xmm8 + a2 * b4;
4773 (~C).store( i , j , xmm1 * factor );
4774 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
4775 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
4776 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
4777 (~C).store( i+1UL, j , xmm5 * factor );
4778 (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
4779 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
4780 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
// Single-row variant for the 4-vector panel (only row i is touched).
4785 const size_t kbegin( ( IsUpper<MT4>::value )
4786 ?( ( IsLower<MT5>::value )
4787 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4788 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4789 :( IsLower<MT5>::value ? j : 0UL ) );
4790 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
4792 SIMDType xmm1, xmm2, xmm3, xmm4;
4794 for(
size_t k=kbegin; k<kend; ++k ) {
4795 const SIMDType a1(
set( A(i,k) ) );
4796 xmm1 = xmm1 + a1 * B.load(k,j );
4797 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
4798 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
4799 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
4802 (~C).store( i, j , xmm1 * factor );
4803 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
4804 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
4805 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
// Column panel of 2 SIMD vectors.
4809 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
// Two rows at a time: 2 x 2 accumulator registers.
4813 for( ; (i+2UL) <= M; i+=2UL )
4815 const size_t kbegin( ( IsUpper<MT4>::value )
4816 ?( ( IsLower<MT5>::value )
4817 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4818 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4819 :( IsLower<MT5>::value ? j : 0UL ) );
4820 const size_t kend( ( IsLower<MT4>::value )
4821 ?( ( IsUpper<MT5>::value )
4822 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
4823 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4824 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
4826 SIMDType xmm1, xmm2, xmm3, xmm4;
4828 for(
size_t k=kbegin; k<kend; ++k ) {
4829 const SIMDType a1(
set( A(i ,k) ) );
4830 const SIMDType a2(
set( A(i+1UL,k) ) );
4831 const SIMDType b1( B.load(k,j ) );
4832 const SIMDType b2( B.load(k,j+SIMDSIZE) );
4833 xmm1 = xmm1 + a1 * b1;
4834 xmm2 = xmm2 + a1 * b2;
4835 xmm3 = xmm3 + a2 * b1;
4836 xmm4 = xmm4 + a2 * b2;
4839 (~C).store( i , j , xmm1 * factor );
4840 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
4841 (~C).store( i+1UL, j , xmm3 * factor );
4842 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
// Single-row variant for the 2-vector panel.
4847 const size_t kbegin( ( IsUpper<MT4>::value )
4848 ?( ( IsLower<MT5>::value )
4849 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4850 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4851 :( IsLower<MT5>::value ? j : 0UL ) );
4852 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
4854 SIMDType xmm1, xmm2;
4856 for(
size_t k=kbegin; k<kend; ++k ) {
4857 const SIMDType a1(
set( A(i,k) ) );
4858 xmm1 = xmm1 + a1 * B.load(k,j );
4859 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
4862 (~C).store( i, j , xmm1 * factor );
4863 (~C).store( i, j+SIMDSIZE, xmm2 * factor );
// Column panel of a single SIMD vector.
4867 for( ; j<jpos; j+=SIMDSIZE )
// Two rows at a time.
4871 for( ; (i+2UL) <= M; i+=2UL )
4873 const size_t kbegin( ( IsUpper<MT4>::value )
4874 ?( ( IsLower<MT5>::value )
4875 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4876 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4877 :( IsLower<MT5>::value ? j : 0UL ) );
4878 const size_t kend( ( IsLower<MT4>::value )
4879 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4882 SIMDType xmm1, xmm2;
4884 for(
size_t k=kbegin; k<kend; ++k ) {
4885 const SIMDType b1( B.load(k,j) );
4886 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
4887 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
4890 (~C).store( i , j, xmm1 * factor );
4891 (~C).store( i+1UL, j, xmm2 * factor );
// Single-row variant for the 1-vector panel; the k-loop runs up to K.
4896 const size_t kbegin( ( IsUpper<MT4>::value )
4897 ?( ( IsLower<MT5>::value )
4898 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4899 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4900 :( IsLower<MT5>::value ? j : 0UL ) );
4904 for(
size_t k=kbegin; k<K; ++k ) {
4905 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
4908 (~C).store( i, j, xmm1 * factor );
// Scalar loop for the columns in [jpos, N) that do not fill a SIMD vector.
4912 for( ; remainder && j<N; ++j )
// Two rows at a time with scalar accumulators.
4916 for( ; (i+2UL) <= M; i+=2UL )
4918 const size_t kbegin( ( IsUpper<MT4>::value )
4919 ?( ( IsLower<MT5>::value )
4920 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4921 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4922 :( IsLower<MT5>::value ? j : 0UL ) );
4923 const size_t kend( ( IsLower<MT4>::value )
4924 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4930 for(
size_t k=kbegin; k<kend; ++k ) {
4931 value1 += A(i ,k) * B(k,j);
4932 value2 += A(i+1UL,k) * B(k,j);
4935 (~C)(i ,j) = value1 * scalar;
4936 (~C)(i+1UL,j) = value2 * scalar;
// Single-row scalar variant.
4941 const size_t kbegin( ( IsUpper<MT4>::value )
4942 ?( ( IsLower<MT5>::value )
4943 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4944 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4945 :( IsLower<MT5>::value ? j : 0UL ) );
4949 for(
size_t k=kbegin; k<K; ++k ) {
4950 value += A(i,k) * B(k,j);
4953 (~C)(i,j) = value * scalar;
// Small-matrix assignment kernel (C = scalar * A * B) for the overload that
// takes the target as DenseMatrix<MT3,true> and is enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Instead of running its own loops, it copies ONE operand into its
// OppositeType_ (via serial()) and re-dispatches the scaled product through
// assign().
// NOTE(review): the `true` flag presumably denotes a column-major target -- confirm.
4974 template<
typename MT3
4978 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4979 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only B is resizable: convert A.
4986 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4987 const OppositeType_<MT4> tmp(
serial( A ) );
4988 assign( ~C, tmp * B * scalar );
// Only A is resizable: convert B.
4990 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
4991 const OppositeType_<MT5> tmp(
serial( B ) );
4992 assign( ~C, A * tmp * scalar );
// Otherwise convert whichever operand has fewer elements (A on a tie).
4994 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4995 const OppositeType_<MT4> tmp(
serial( A ) );
4996 assign( ~C, tmp * B * scalar );
4999 const OppositeType_<MT5> tmp(
serial( B ) );
5000 assign( ~C, A * tmp * scalar );
// Large-matrix assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold. Forwards to the
// default (non-vectorized) kernel.
5019 template<
typename MT3
5023 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5024 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5026 selectDefaultAssignKernel( C, A, B, scalar );
// Blocked SIMD assignment kernel for a DenseMatrix<MT3,false> target, enabled via
// EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>.
// The iteration space is tiled by DMATDMATMULT_JBLOCK_SIZE (columns of C),
// DMATDMATMULT_IBLOCK_SIZE (rows of C) and DMATDMATMULT_KBLOCK_SIZE (inner dimension).
// Inside a tile, products A(i,k) * B.load(k,j) are accumulated in SIMD registers and
// `accumulator * factor` is added to the value already stored in C.
//   C      - target matrix
//   A, B   - left and right operand of the product
//   scalar - scaling factor, broadcast into `factor`
// NOTE(review): several original lines (braces, declarations of the running indices
// and of some accumulators) are not visible in this listing.
5045 template<
typename MT3
5049 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5050 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5052 const size_t M( A.rows() );
5053 const size_t N( B.columns() );
5054 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both MT3 and MT5 are padded.
5056 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5058 const SIMDType factor(
set( scalar ) );
5060 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
5062 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// With a remainder, jpos is jend rounded down to a multiple of SIMDSIZE (checked by
// the assertion below); otherwise the SIMD loops run up to jend.
5064 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
5065 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
5067 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
5069 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// NOTE(review): the body of this i/j loop is not visible in this listing; since the
// stores below add to the current content of C, it presumably clears the C tile
// first - confirm against the full file.
5071 for(
size_t i=ii; i<iend; ++i ) {
5072 for(
size_t j=jj; j<jend; ++j ) {
5077 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
5079 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// Tier 1: four SIMD vectors of columns per step.
5083 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5085 const size_t j1( j+SIMDSIZE );
5086 const size_t j2( j+SIMDSIZE*2UL );
5087 const size_t j3( j+SIMDSIZE*3UL );
// Two rows at a time.
5091 for( ; (i+2UL) <= iend; i+=2UL )
// The k-range is clipped to the current k-block and, for triangular operands, to the
// indices selected by the IsUpper/IsLower tests on MT4 and MT5.
5093 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5094 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5095 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5096 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
5098 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5100 for(
size_t k=kbegin; k<kend; ++k ) {
5101 const SIMDType a1(
set( A(i ,k) ) );
5102 const SIMDType a2(
set( A(i+1UL,k) ) );
5103 const SIMDType b1( B.load(k,j ) );
5104 const SIMDType b2( B.load(k,j1) );
5105 const SIMDType b3( B.load(k,j2) );
5106 const SIMDType b4( B.load(k,j3) );
5107 xmm1 = xmm1 + a1 * b1;
5108 xmm2 = xmm2 + a1 * b2;
5109 xmm3 = xmm3 + a1 * b3;
5110 xmm4 = xmm4 + a1 * b4;
5111 xmm5 = xmm5 + a2 * b1;
5112 xmm6 = xmm6 + a2 * b2;
5113 xmm7 = xmm7 + a2 * b3;
5114 xmm8 = xmm8 + a2 * b4;
// Add the scaled partial sums of this k-block to C.
5117 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5118 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5119 (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
5120 (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
5121 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5122 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
5123 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
5124 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// Single-row case of tier 1 (its guarding condition is not visible in this listing).
5129 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5130 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5131 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5132 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
5134 SIMDType xmm1, xmm2, xmm3, xmm4;
5136 for(
size_t k=kbegin; k<kend; ++k ) {
5137 const SIMDType a1(
set( A(i,k) ) );
5138 xmm1 = xmm1 + a1 * B.load(k,j );
5139 xmm2 = xmm2 + a1 * B.load(k,j1);
5140 xmm3 = xmm3 + a1 * B.load(k,j2);
5141 xmm4 = xmm4 + a1 * B.load(k,j3);
5144 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5145 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
5146 (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
5147 (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// Tier 2: two SIMD vectors of columns per step.
5151 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5153 const size_t j1( j+SIMDSIZE );
// Four rows at a time.
5157 for( ; (i+4UL) <= iend; i+=4UL )
5159 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5160 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5161 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5162 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
// Accumulators are laid out as (row, column-vector) pairs: xmm1/xmm2 for row i,
// xmm3/xmm4 for row i+1, xmm5/xmm6 for row i+2, xmm7/xmm8 for row i+3.
5164 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5166 for(
size_t k=kbegin; k<kend; ++k ) {
5167 const SIMDType a1(
set( A(i ,k) ) );
5168 const SIMDType a2(
set( A(i+1UL,k) ) );
5169 const SIMDType a3(
set( A(i+2UL,k) ) );
5170 const SIMDType a4(
set( A(i+3UL,k) ) );
5171 const SIMDType b1( B.load(k,j ) );
5172 const SIMDType b2( B.load(k,j1) );
5173 xmm1 = xmm1 + a1 * b1;
5174 xmm2 = xmm2 + a1 * b2;
5175 xmm3 = xmm3 + a2 * b1;
5176 xmm4 = xmm4 + a2 * b2;
5177 xmm5 = xmm5 + a3 * b1;
5178 xmm6 = xmm6 + a3 * b2;
5179 xmm7 = xmm7 + a4 * b1;
5180 xmm8 = xmm8 + a4 * b2;
5183 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5184 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5185 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5186 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
5187 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
5188 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
5189 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
5190 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// Two rows at a time for what the four-row loop left over.
5193 for( ; (i+2UL) <= iend; i+=2UL )
5195 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5196 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5197 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5198 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5200 SIMDType xmm1, xmm2, xmm3, xmm4;
5202 for(
size_t k=kbegin; k<kend; ++k ) {
5203 const SIMDType a1(
set( A(i ,k) ) );
5204 const SIMDType a2(
set( A(i+1UL,k) ) );
5205 const SIMDType b1( B.load(k,j ) );
5206 const SIMDType b2( B.load(k,j1) );
5207 xmm1 = xmm1 + a1 * b1;
5208 xmm2 = xmm2 + a1 * b2;
5209 xmm3 = xmm3 + a2 * b1;
5210 xmm4 = xmm4 + a2 * b2;
5213 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5214 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5215 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5216 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// Single-row case of tier 2 (its guarding condition is not visible in this listing).
5221 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5222 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5223 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5224 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5226 SIMDType xmm1, xmm2;
5228 for(
size_t k=kbegin; k<kend; ++k ) {
5229 const SIMDType a1(
set( A(i,k) ) );
5230 xmm1 = xmm1 + a1 * B.load(k,j );
5231 xmm2 = xmm2 + a1 * B.load(k,j1);
5234 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5235 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// Tier 3: one SIMD vector of columns per step, one row at a time.
5239 for( ; j<jpos; j+=SIMDSIZE )
5241 for(
size_t i=ii; i<iend; ++i )
5243 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5244 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5245 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5246 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
5250 for(
size_t k=kbegin; k<kend; ++k ) {
5251 const SIMDType a1(
set( A(i,k) ) );
5252 xmm1 = xmm1 + a1 * B.load(k,j);
5255 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar remainder: columns in [jpos, jend) are handled element by element, only when
// `remainder` is set.
5259 for( ; remainder && j<jend; ++j )
5261 for(
size_t i=ii; i<iend; ++i )
5263 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5264 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5265 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5266 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
5270 for(
size_t k=kbegin; k<kend; ++k ) {
5271 value += A(i,k) * B(k,j);
5274 (~C)(i,j) += value * scalar;
// Large-kernel assignment overload for a DenseMatrix<MT3,true> target, enabled via
// EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>. There is no dedicated
// large kernel here: the call is forwarded to selectSmallAssignKernel().
5297 template<
typename MT3
5301 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5302 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5304 selectSmallAssignKernel( ~C, A, B, scalar );
// BLAS assignment fallback: this overload is selected through DisableIf_ on
// UseBlasKernel<MT3,MT4,MT5,ST2> and forwards all arguments to
// selectLargeAssignKernel().
5322 template<
typename MT3
5326 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5327 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5329 selectLargeAssignKernel( C, A, B, scalar );
5334 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based assignment kernel, enabled via EnableIf_ on UseBlasKernel<MT3,MT4,MT5,ST2>.
// A triangular left or right operand goes through trmm(); everything else through
// gemm(). The scalar is converted to the element type of C (ET) before the call.
// NOTE(review): in both trmm branches the statement preceding the trmm() call is not
// visible in this listing; trmm presumably works in place on C, so C would have to
// hold the other operand at that point - confirm against the full file.
5348 template<
typename MT3
5352 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5353 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5355 typedef ElementType_<MT3> ET;
// Triangular A: multiply from the left, lower or upper according to IsLower<MT4>.
5357 if( IsTriangular<MT4>::value ) {
5359 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Triangular B: multiply from the right, lower or upper according to IsLower<MT5>.
5361 else if( IsTriangular<MT5>::value ) {
5363 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General case; the last two arguments are ET(scalar) and ET(0) - by the usual BLAS
// gemm convention these would be alpha and beta (TODO confirm for this wrapper).
5366 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled product to a SparseMatrix<MT,SO> target; this overload is
// selected through DisableIf_ on CanExploitSymmetry<MT,MT1,MT2>.
// The expression is first evaluated serially into a temporary, which is then assigned
// to the target.
5384 template<
typename MT
5386 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5387 assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type: chosen between OppositeType and ResultType depending on SO.
5391 typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
5403 const TmpType tmp(
serial( rhs ) );
5404 assign( ~lhs, tmp );
// Symmetry-exploiting assignment to a Matrix<MT,true> target, enabled via EnableIf_ on
// CanExploitSymmetry<MT,MT1,MT2>. Each symmetric operand is wrapped in trans() and the
// restructured scaled product is handed back to assign().
5422 template<
typename MT >
5423 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5424 assign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
5433 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5434 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
5436 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
5437 assign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
// Only the left operand symmetric: transpose the left one.
5438 else if( IsSymmetric<MT1>::value )
5439 assign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case (the `else` line is not visible in this listing): transpose the
// right operand.
5441 assign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Addition assignment of the scaled product to a DenseMatrix<MT,SO> target; this
// overload is selected through DisableIf_ on CanExploitSymmetry<MT,MT1,MT2>.
5457 template<
typename MT
5459 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5460 addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
5467 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5468 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Empty target or zero inner dimension. The statement inside this branch is not
// visible in this listing - presumably an early return; confirm.
5470 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Kernel dispatch. NOTE(review): the definitions of A and B are not visible here;
// presumably they are the evaluated left/right operands.
5484 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += A * B * scalar.
// The small kernel is used when B is diagonal or when C has fewer than
// DMATDMATMULT_THRESHOLD elements; otherwise the BLAS kernel entry point is used.
5499 template<
typename MT3
5503 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5505 if( ( IsDiagonal<MT5>::value ) ||
5506 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5507 selectSmallAddAssignKernel( C, A, B, scalar );
// Otherwise (the `else` line is not visible in this listing):
5509 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel for the case that neither A nor B is diagonal.
// The scaled product is evaluated serially into a ResultType temporary, which is then
// added to C.
5527 template<
typename MT3
5531 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
5532 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5534 const ResultType tmp(
serial( A * B * scalar ) );
5535 addAssign( C, tmp );
// Default addition-assignment kernel for a non-diagonal A and a diagonal B.
// Only the diagonal of B is read, so each element needs a single product:
// C(i,j) += A(i,j) * B(j,j) * scalar.
// NOTE(review): parts of the jbegin/jend expressions and some braces are not visible
// in this listing.
5553 template<
typename MT3
5557 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5558 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5562 const size_t M( A.rows() );
5563 const size_t N( B.columns() );
5565 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed according to the triangular structure of A (MT4).
5567 const size_t jbegin( ( IsUpper<MT4>::value )
5568 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5570 const size_t jend( ( IsLower<MT4>::value )
5571 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos marks the end of the part of the range that is processed two columns at a time
// (jnum with its lowest bit cleared).
5575 const size_t jnum( jend - jbegin );
5576 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
5578 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5579 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5580 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Leftover column at jpos (the guarding condition is not visible in this listing).
5583 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition-assignment kernel for a diagonal A and a non-diagonal B.
// Only the diagonal of A is read, so each element needs a single product:
// C(i,j) += A(i,i) * B(i,j) * scalar.
// NOTE(review): parts of the jbegin/jend expressions and some braces are not visible
// in this listing.
5603 template<
typename MT3
5607 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
5608 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5612 const size_t M( A.rows() );
5613 const size_t N( B.columns() );
5615 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed according to the triangular structure of B (MT5).
5617 const size_t jbegin( ( IsUpper<MT5>::value )
5618 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
5620 const size_t jend( ( IsLower<MT5>::value )
5621 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos marks the end of the part of the range that is processed two columns at a time
// (jnum with its lowest bit cleared).
5625 const size_t jnum( jend - jbegin );
5626 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
5628 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5629 C(i,j ) += A(i,i) * B(i,j ) * scalar;
5630 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Leftover column at jpos (the guarding condition is not visible in this listing).
5633 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition-assignment kernel for two diagonal operands: only the diagonal of C
// is updated, with C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
5653 template<
typename MT3
5657 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
5658 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5662 for(
size_t i=0UL; i<A.rows(); ++i ) {
5663 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-kernel addition-assignment fallback: this overload is selected through
// DisableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> and forwards all arguments
// to selectDefaultAddAssignKernel().
5682 template<
typename MT3
5686 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5687 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5689 selectDefaultAddAssignKernel( C, A, B, scalar );
// Unblocked SIMD addition-assignment kernel for a DenseMatrix<MT3,false> target,
// enabled via EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>.
// Columns of C are processed in tiers of 8, 4, 2 and 1 SIMD vectors, followed by a
// scalar remainder loop. For every tile the products A(i,k) * B.load(k,j) are
// accumulated in SIMD registers and `accumulator * factor` is added to C.
//   C      - target matrix
//   A, B   - left and right operand of the product
//   scalar - scaling factor, broadcast into `factor`
// NOTE(review): several original lines (braces, declarations of the running indices
// i and j and of some accumulators, parts of a few kend expressions) are not visible
// in this listing.
5708 template<
typename MT3
5712 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5713 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5715 const size_t M( A.rows() );
5716 const size_t N( B.columns() );
5717 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both MT3 and MT5 are padded.
5719 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the SIMD-processed columns: N with the low bits masked off when a remainder
// is needed, N otherwise.
5721 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
5724 const SIMDType factor(
set( scalar ) );
// Tier 1: eight SIMD vectors of columns per step, one row at a time.
5728 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5729 for(
size_t i=0UL; i<M; ++i )
// The k-range is narrowed for triangular operands via the IsUpper/IsLower (and
// strict variants) tests on MT4 and MT5.
5731 const size_t kbegin( ( IsUpper<MT4>::value )
5732 ?( ( IsLower<MT5>::value )
5733 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5734 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5735 :( IsLower<MT5>::value ? j : 0UL ) );
5736 const size_t kend( ( IsLower<MT4>::value )
5737 ?( ( IsUpper<MT5>::value )
5738 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5739 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
5740 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
5742 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5744 for(
size_t k=kbegin; k<kend; ++k ) {
5745 const SIMDType a1(
set( A(i,k) ) );
5746 xmm1 = xmm1 + a1 * B.load(k,j );
5747 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
5748 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
5749 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
5750 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
5751 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
5752 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
5753 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
// Add the scaled sums to the current content of C.
5756 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5757 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
5758 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
5759 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
5760 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
5761 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
5762 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
5763 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
// Tier 2: four SIMD vectors of columns per step.
5767 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
// Two rows at a time.
5771 for( ; (i+2UL) <= M; i+=2UL )
5773 const size_t kbegin( ( IsUpper<MT4>::value )
5774 ?( ( IsLower<MT5>::value )
5775 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5776 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5777 :( IsLower<MT5>::value ? j : 0UL ) );
5778 const size_t kend( ( IsLower<MT4>::value )
5779 ?( ( IsUpper<MT5>::value )
5780 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5781 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5782 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
5784 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5786 for(
size_t k=kbegin; k<kend; ++k ) {
5787 const SIMDType a1(
set( A(i ,k) ) );
5788 const SIMDType a2(
set( A(i+1UL,k) ) );
5789 const SIMDType b1( B.load(k,j ) );
5790 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5791 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5792 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5793 xmm1 = xmm1 + a1 * b1;
5794 xmm2 = xmm2 + a1 * b2;
5795 xmm3 = xmm3 + a1 * b3;
5796 xmm4 = xmm4 + a1 * b4;
5797 xmm5 = xmm5 + a2 * b1;
5798 xmm6 = xmm6 + a2 * b2;
5799 xmm7 = xmm7 + a2 * b3;
5800 xmm8 = xmm8 + a2 * b4;
5803 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5804 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
5805 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
5806 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
5807 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5808 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
5809 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
5810 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
// Single-row case of tier 2 (its guarding condition is not visible in this listing).
5815 const size_t kbegin( ( IsUpper<MT4>::value )
5816 ?( ( IsLower<MT5>::value )
5817 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5818 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5819 :( IsLower<MT5>::value ? j : 0UL ) );
5820 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
5822 SIMDType xmm1, xmm2, xmm3, xmm4;
5824 for(
size_t k=kbegin; k<kend; ++k ) {
5825 const SIMDType a1(
set( A(i,k) ) );
5826 xmm1 = xmm1 + a1 * B.load(k,j );
5827 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
5828 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
5829 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
5832 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5833 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
5834 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
5835 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
// Tier 3: two SIMD vectors of columns per step.
5839 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
// Two rows at a time.
5843 for( ; (i+2UL) <= M; i+=2UL )
5845 const size_t kbegin( ( IsUpper<MT4>::value )
5846 ?( ( IsLower<MT5>::value )
5847 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5848 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5849 :( IsLower<MT5>::value ? j : 0UL ) );
5850 const size_t kend( ( IsLower<MT4>::value )
5851 ?( ( IsUpper<MT5>::value )
5852 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5853 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5854 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
5856 SIMDType xmm1, xmm2, xmm3, xmm4;
5858 for(
size_t k=kbegin; k<kend; ++k ) {
5859 const SIMDType a1(
set( A(i ,k) ) );
5860 const SIMDType a2(
set( A(i+1UL,k) ) );
5861 const SIMDType b1( B.load(k,j ) );
5862 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5863 xmm1 = xmm1 + a1 * b1;
5864 xmm2 = xmm2 + a1 * b2;
5865 xmm3 = xmm3 + a2 * b1;
5866 xmm4 = xmm4 + a2 * b2;
5869 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5870 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
5871 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5872 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
// Single-row case of tier 3 (its guarding condition is not visible in this listing).
5877 const size_t kbegin( ( IsUpper<MT4>::value )
5878 ?( ( IsLower<MT5>::value )
5879 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5880 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5881 :( IsLower<MT5>::value ? j : 0UL ) );
5882 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
5884 SIMDType xmm1, xmm2;
5886 for(
size_t k=kbegin; k<kend; ++k ) {
5887 const SIMDType a1(
set( A(i,k) ) );
5888 xmm1 = xmm1 + a1 * B.load(k,j );
5889 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
5892 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5893 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + xmm2 * factor );
// Tier 4: one SIMD vector of columns per step.
5897 for( ; j<jpos; j+=SIMDSIZE )
// Two rows at a time.
5901 for( ; (i+2UL) <= M; i+=2UL )
5903 const size_t kbegin( ( IsUpper<MT4>::value )
5904 ?( ( IsLower<MT5>::value )
5905 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5906 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5907 :( IsLower<MT5>::value ? j : 0UL ) );
5908 const size_t kend( ( IsLower<MT4>::value )
5909 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5912 SIMDType xmm1, xmm2;
5914 for(
size_t k=kbegin; k<kend; ++k ) {
5915 const SIMDType b1( B.load(k,j) );
5916 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
5917 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
5920 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5921 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
// Single-row case of tier 4 (its guarding condition is not visible in this listing);
// the inner loop runs up to K.
5926 const size_t kbegin( ( IsUpper<MT4>::value )
5927 ?( ( IsLower<MT5>::value )
5928 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5929 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5930 :( IsLower<MT5>::value ? j : 0UL ) );
5934 for(
size_t k=kbegin; k<K; ++k ) {
5935 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
5938 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar remainder: columns in [jpos, N) are handled element by element, only when
// `remainder` is set.
5942 for( ; remainder && j<N; ++j )
// Two rows at a time.
5946 for( ; (i+2UL) <= M; i+=2UL )
5948 const size_t kbegin( ( IsUpper<MT4>::value )
5949 ?( ( IsLower<MT5>::value )
5950 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5951 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5952 :( IsLower<MT5>::value ? j : 0UL ) );
5953 const size_t kend( ( IsLower<MT4>::value )
5954 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5960 for(
size_t k=kbegin; k<kend; ++k ) {
5961 value1 += A(i ,k) * B(k,j);
5962 value2 += A(i+1UL,k) * B(k,j);
5965 (~C)(i ,j) += value1 * scalar;
5966 (~C)(i+1UL,j) += value2 * scalar;
// Single-row case of the remainder (its guarding condition is not visible in this
// listing).
5971 const size_t kbegin( ( IsUpper<MT4>::value )
5972 ?( ( IsLower<MT5>::value )
5973 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5974 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5975 :( IsLower<MT5>::value ? j : 0UL ) );
5979 for(
size_t k=kbegin; k<K; ++k ) {
5980 value += A(i,k) * B(k,j);
5983 (~C)(i,j) += value * scalar;
// Small-kernel addition-assignment overload for a DenseMatrix<MT3,true> target,
// enabled via EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>.
// It does not compute the product itself: one operand is copied (serially) into a
// temporary of that operand's OppositeType_ and the scaled product is handed back to
// addAssign().
// NOTE(review): the rest of the template parameter list and the braces of this
// function are not visible in this listing.
6004 template<
typename MT3
6008 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6009 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is not resizable but B is: the temporary is built from A.
6016 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
6017 const OppositeType_<MT4> tmp(
serial( A ) );
6018 addAssign( ~C, tmp * B * scalar );
// A is resizable but B is not: the temporary is built from B.
6020 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
6021 const OppositeType_<MT5> tmp(
serial( B ) );
6022 addAssign( ~C, A * tmp * scalar );
// Otherwise the operand with fewer elements is copied; on a tie A is copied.
6024 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6025 const OppositeType_<MT4> tmp(
serial( A ) );
6026 addAssign( ~C, tmp * B * scalar );
// Presumably the final else branch (the `else` line itself is not visible): copy B.
6029 const OppositeType_<MT5> tmp(
serial( B ) );
6030 addAssign( ~C, A * tmp * scalar );
// Large-kernel addition-assignment fallback: this overload is selected through
// DisableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> and forwards all arguments
// to selectDefaultAddAssignKernel().
6049 template<
typename MT3
6053 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6054 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6056 selectDefaultAddAssignKernel( C, A, B, scalar );
// Blocked SIMD addition-assignment kernel for a DenseMatrix<MT3,false> target, enabled
// via EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>.
// The iteration space is tiled by DMATDMATMULT_JBLOCK_SIZE (columns of C),
// DMATDMATMULT_IBLOCK_SIZE (rows of C) and DMATDMATMULT_KBLOCK_SIZE (inner dimension).
// Inside a tile, products A(i,k) * B.load(k,j) are accumulated in SIMD registers and
// `accumulator * factor` is added to the value already stored in C.
//   C      - target matrix
//   A, B   - left and right operand of the product
//   scalar - scaling factor, broadcast into `factor`
// NOTE(review): several original lines (braces, declarations of the running indices
// and of some accumulators) are not visible in this listing.
6075 template<
typename MT3
6079 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6080 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6082 const size_t M( A.rows() );
6083 const size_t N( B.columns() );
6084 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both MT3 and MT5 are padded.
6086 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6088 const SIMDType factor(
set( scalar ) );
6090 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
6092 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// With a remainder, jpos is jend rounded down to a multiple of SIMDSIZE (checked by
// the assertion below); otherwise the SIMD loops run up to jend.
6094 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
6095 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
6097 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
6099 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
6101 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
6103 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// Tier 1: four SIMD vectors of columns per step.
6107 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6109 const size_t j1( j+SIMDSIZE );
6110 const size_t j2( j+SIMDSIZE*2UL );
6111 const size_t j3( j+SIMDSIZE*3UL );
// Two rows at a time.
6115 for( ; (i+2UL) <= iend; i+=2UL )
// The k-range is clipped to the current k-block and, for triangular operands, to the
// indices selected by the IsUpper/IsLower tests on MT4 and MT5.
6117 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6118 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6119 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
6120 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
6122 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6124 for(
size_t k=kbegin; k<kend; ++k ) {
6125 const SIMDType a1(
set( A(i ,k) ) );
6126 const SIMDType a2(
set( A(i+1UL,k) ) );
6127 const SIMDType b1( B.load(k,j ) );
6128 const SIMDType b2( B.load(k,j1) );
6129 const SIMDType b3( B.load(k,j2) );
6130 const SIMDType b4( B.load(k,j3) );
6131 xmm1 = xmm1 + a1 * b1;
6132 xmm2 = xmm2 + a1 * b2;
6133 xmm3 = xmm3 + a1 * b3;
6134 xmm4 = xmm4 + a1 * b4;
6135 xmm5 = xmm5 + a2 * b1;
6136 xmm6 = xmm6 + a2 * b2;
6137 xmm7 = xmm7 + a2 * b3;
6138 xmm8 = xmm8 + a2 * b4;
// Add the scaled partial sums of this k-block to C.
6141 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6142 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6143 (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
6144 (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
6145 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
6146 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
6147 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
6148 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// Single-row case of tier 1 (its guarding condition is not visible in this listing).
6153 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6154 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6155 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6156 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
6158 SIMDType xmm1, xmm2, xmm3, xmm4;
6160 for(
size_t k=kbegin; k<kend; ++k ) {
6161 const SIMDType a1(
set( A(i,k) ) );
6162 xmm1 = xmm1 + a1 * B.load(k,j );
6163 xmm2 = xmm2 + a1 * B.load(k,j1);
6164 xmm3 = xmm3 + a1 * B.load(k,j2);
6165 xmm4 = xmm4 + a1 * B.load(k,j3);
6168 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6169 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
6170 (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
6171 (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// Tier 2: two SIMD vectors of columns per step.
6175 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6177 const size_t j1( j+SIMDSIZE );
// Four rows at a time.
6181 for( ; (i+4UL) <= iend; i+=4UL )
6183 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6184 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6185 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
6186 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
// Accumulators are laid out as (row, column-vector) pairs: xmm1/xmm2 for row i,
// xmm3/xmm4 for row i+1, xmm5/xmm6 for row i+2, xmm7/xmm8 for row i+3.
6188 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6190 for(
size_t k=kbegin; k<kend; ++k ) {
6191 const SIMDType a1(
set( A(i ,k) ) );
6192 const SIMDType a2(
set( A(i+1UL,k) ) );
6193 const SIMDType a3(
set( A(i+2UL,k) ) );
6194 const SIMDType a4(
set( A(i+3UL,k) ) );
6195 const SIMDType b1( B.load(k,j ) );
6196 const SIMDType b2( B.load(k,j1) );
6197 xmm1 = xmm1 + a1 * b1;
6198 xmm2 = xmm2 + a1 * b2;
6199 xmm3 = xmm3 + a2 * b1;
6200 xmm4 = xmm4 + a2 * b2;
6201 xmm5 = xmm5 + a3 * b1;
6202 xmm6 = xmm6 + a3 * b2;
6203 xmm7 = xmm7 + a4 * b1;
6204 xmm8 = xmm8 + a4 * b2;
6207 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6208 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6209 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6210 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
6211 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
6212 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
6213 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
6214 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// Two rows at a time for what the four-row loop left over.
6217 for( ; (i+2UL) <= iend; i+=2UL )
6219 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6220 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6221 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
6222 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
6224 SIMDType xmm1, xmm2, xmm3, xmm4;
6226 for(
size_t k=kbegin; k<kend; ++k ) {
6227 const SIMDType a1(
set( A(i ,k) ) );
6228 const SIMDType a2(
set( A(i+1UL,k) ) );
6229 const SIMDType b1( B.load(k,j ) );
6230 const SIMDType b2( B.load(k,j1) );
6231 xmm1 = xmm1 + a1 * b1;
6232 xmm2 = xmm2 + a1 * b2;
6233 xmm3 = xmm3 + a2 * b1;
6234 xmm4 = xmm4 + a2 * b2;
6237 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6238 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6239 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6240 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// Single-row case of tier 2 (its guarding condition is not visible in this listing).
6245 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6246 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6247 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6248 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
6250 SIMDType xmm1, xmm2;
6252 for(
size_t k=kbegin; k<kend; ++k ) {
6253 const SIMDType a1(
set( A(i,k) ) );
6254 xmm1 = xmm1 + a1 * B.load(k,j );
6255 xmm2 = xmm2 + a1 * B.load(k,j1);
6258 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6259 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// Tier 3: one SIMD vector of columns per step, one row at a time.
6263 for( ; j<jpos; j+=SIMDSIZE )
6265 for(
size_t i=ii; i<iend; ++i )
6267 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6268 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6269 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6270 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
6274 for(
size_t k=kbegin; k<kend; ++k ) {
6275 const SIMDType a1(
set( A(i,k) ) );
6276 xmm1 = xmm1 + a1 * B.load(k,j);
6279 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar remainder: columns in [jpos, jend) are handled element by element, only when
// `remainder` is set.
6283 for( ; remainder && j<jend; ++j )
6285 for(
size_t i=ii; i<iend; ++i )
6287 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6288 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6289 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6290 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
6294 for(
size_t k=kbegin; k<kend; ++k ) {
6295 value += A(i,k) * B(k,j);
6298 (~C)(i,j) += value * scalar;
// Large-kernel addition-assignment overload for a DenseMatrix<MT3,true> target,
// enabled via EnableIf_ on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>. There is no
// dedicated large kernel here: the call is forwarded to selectSmallAddAssignKernel().
6321 template<
typename MT3
6325 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6326 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6328 selectSmallAddAssignKernel( ~C, A, B, scalar );
// BLAS addition-assignment fallback: this overload is selected through DisableIf_ on
// UseBlasKernel<MT3,MT4,MT5,ST2> and forwards all arguments to
// selectLargeAddAssignKernel().
6346 template<
typename MT3
6350 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6351 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6353 selectLargeAddAssignKernel( C, A, B, scalar );
6358 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition-assignment kernel, enabled via EnableIf_ on
// UseBlasKernel<MT3,MT4,MT5,ST2>.
// For a triangular operand the product is formed in a temporary via trmm() and then
// added to C; otherwise gemm() is called directly on C. The scalar is converted to the
// element type of C (ET) before the call.
6372 template<
typename MT3
6376 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6377 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6379 typedef ElementType_<MT3> ET;
// Triangular A: start from a copy of B and multiply with A from the left
// (trmm presumably updates tmp in place - TODO confirm), then add the result to C.
6381 if( IsTriangular<MT4>::value ) {
6382 ResultType_<MT3> tmp(
serial( B ) );
6383 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6384 addAssign( C, tmp );
// Triangular B: start from a copy of A and multiply with B from the right, then add
// the result to C.
6386 else if( IsTriangular<MT5>::value ) {
6387 ResultType_<MT3> tmp(
serial( A ) );
6388 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6389 addAssign( C, tmp );
// General case; the last two arguments are ET(scalar) and ET(1) - by the usual BLAS
// gemm convention these would be alpha and beta, i.e. the existing content of C is
// kept (TODO confirm for this wrapper).
6392 gemm( C, A, B, ET(scalar), ET(1) );
// Symmetry-exploiting addition assignment to a Matrix<MT,true> target, enabled via
// EnableIf_ on CanExploitSymmetry<MT,MT1,MT2>. Each symmetric operand is wrapped in
// trans() and the restructured scaled product is handed back to addAssign().
6412 template<
typename MT >
6413 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
6414 addAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
6423 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6424 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
6426 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6427 addAssign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
// Only the left operand symmetric: transpose the left one.
6428 else if( IsSymmetric<MT1>::value )
6429 addAssign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case (the `else` line is not visible in this listing): transpose the
// right operand.
6431 addAssign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Subtraction assignment of the scaled product to a DenseMatrix<MT,SO> target; this
// overload is selected through DisableIf_ on CanExploitSymmetry<MT,MT1,MT2>.
6451 template<
typename MT
6453 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
6454 subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6461 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6462 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Empty target or zero inner dimension. The statement inside this branch is not
// visible in this listing - presumably an early return; confirm.
6464 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Kernel dispatch. NOTE(review): the definitions of A and B are not visible here;
// presumably they are the evaluated left/right operands.
6478 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment C -= A * B * scalar.
// The small kernel is used when B is diagonal or when C has fewer than
// DMATDMATMULT_THRESHOLD elements; otherwise the BLAS kernel entry point is used.
6493 template<
typename MT3
6497 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6499 if( ( IsDiagonal<MT5>::value ) ||
6500 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6501 selectSmallSubAssignKernel( C, A, B, scalar );
// Otherwise (the `else` line is not visible in this listing):
6503 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel for the case that neither A nor B is
// diagonal (see the EnableIf_ condition). The scaled product is first evaluated
// serially into a temporary of type ResultType, which is then subtracted from C.
6521 template<
typename MT3
6525 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6526 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6528 const ResultType tmp(
serial( A * B * scalar ) );
6529 subAssign( C, tmp );
// Default subtraction-assignment kernel for a non-diagonal A and a diagonal B.
// Only the diagonal of B is read, so each element reduces to
// C(i,j) -= A(i,j) * B(j,j) * scalar.
// NOTE(review): the fallback branches of jbegin/jend, the braces and the guard of
// the trailing single-element update are missing from this listing.
6547 template<
typename MT3
6551 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6552 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6556 const size_t M( A.rows() );
6557 const size_t N( B.columns() );
6559 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when A is (strictly) upper or lower.
6561 const size_t jbegin( ( IsUpper<MT4>::value )
6562 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6564 const size_t jend( ( IsLower<MT4>::value )
6565 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos = jbegin plus the largest even count within [jbegin,jend): end of the
// two-column unrolled loop.
6569 const size_t jnum( jend - jbegin );
6570 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6572 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6573 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6574 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of column jpos (presumably only executed for an odd count - the guarding
// condition is not visible here).
6577 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction-assignment kernel for a diagonal A and a non-diagonal B.
// Only the diagonal of A is read, so each element reduces to
// C(i,j) -= A(i,i) * B(i,j) * scalar.
// NOTE(review): the fallback branches of jbegin/jend, the braces and the guard of
// the trailing single-element update are missing from this listing.
6597 template<
typename MT3
6601 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6602 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6606 const size_t M( A.rows() );
6607 const size_t N( B.columns() );
6609 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when B is (strictly) upper or lower.
6611 const size_t jbegin( ( IsUpper<MT5>::value )
6612 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6614 const size_t jend( ( IsLower<MT5>::value )
6615 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos = jbegin plus the largest even count within [jbegin,jend): end of the
// two-column unrolled loop.
6619 const size_t jnum( jend - jbegin );
6620 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6622 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6623 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
6624 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Update of column jpos (presumably only executed for an odd count - the guarding
// condition is not visible here).
6627 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction-assignment kernel for two diagonal operands: only the
// diagonal of C is updated, one element per row of A.
6647 template<
typename MT3
6651 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6652 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6656 for(
size_t i=0UL; i<A.rows(); ++i ) {
6657 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction-assignment kernel for type combinations that do not
// satisfy UseVectorizedDefaultKernel: simply forwards to the default kernel.
6676 template<
typename MT3
6680 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6681 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6683 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for C -= A*B*scalar with a row-major target
// (DenseMatrix<MT3,false>), enabled by UseVectorizedDefaultKernel.
// The columns of C are processed in tiers of 8, 4, 2 and 1 SIMD vectors (see the
// step widths of the j-loops), followed by a scalar loop for leftover columns.
// Within every tier the k-range [kbegin,kend) is narrowed from the upper/lower
// structure of A (MT4) and B (MT5).
// NOTE(review): this listing omits many lines of the definition (declarations of
// i, j, value*, some xmm1 declarations, braces, guards of the single-row
// variants); the comments describe only what is visible.
6702 template<
typename MT3
6706 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6707 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6709 const size_t M( A.rows() );
6710 const size_t N( B.columns() );
6711 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
6713 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the SIMD-processed column range: N masked with -SIMDSIZE when a remainder
// is possible (assumes SIMDSIZE is a power of two - TODO confirm), otherwise N.
6715 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// The scalar broadcast into a SIMD vector.
6718 const SIMDType factor(
set( scalar ) );
// Tier 1: 8 SIMD vectors per step, one row of C at a time.
6722 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6723 for(
size_t i=0UL; i<M; ++i )
6725 const size_t kbegin( ( IsUpper<MT4>::value )
6726 ?( ( IsLower<MT5>::value )
6727 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6728 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6729 :( IsLower<MT5>::value ? j : 0UL ) );
6730 const size_t kend( ( IsLower<MT4>::value )
6731 ?( ( IsUpper<MT5>::value )
6732 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
6733 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6734 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
// Accumulators are declared without initializer.
// NOTE(review): relies on SIMDType default construction yielding zero - confirm.
6736 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6738 for(
size_t k=kbegin; k<kend; ++k ) {
6739 const SIMDType a1(
set( A(i,k) ) );
6740 xmm1 = xmm1 + a1 * B.load(k,j );
6741 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
6742 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
6743 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
6744 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
6745 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
6746 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
6747 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
// Scale the accumulated products and subtract them from the loaded values of C.
6750 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6751 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
6752 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
6753 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
6754 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
6755 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
6756 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
6757 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
// Tier 2: 4 SIMD vectors per step, two rows of C at a time (2x4 register block).
6761 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6765 for( ; (i+2UL) <= M; i+=2UL )
6767 const size_t kbegin( ( IsUpper<MT4>::value )
6768 ?( ( IsLower<MT5>::value )
6769 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6770 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6771 :( IsLower<MT5>::value ? j : 0UL ) );
6772 const size_t kend( ( IsLower<MT4>::value )
6773 ?( ( IsUpper<MT5>::value )
6774 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
6775 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6776 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
6778 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6780 for(
size_t k=kbegin; k<kend; ++k ) {
6781 const SIMDType a1(
set( A(i ,k) ) );
6782 const SIMDType a2(
set( A(i+1UL,k) ) );
6783 const SIMDType b1( B.load(k,j ) );
6784 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6785 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6786 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6787 xmm1 = xmm1 + a1 * b1;
6788 xmm2 = xmm2 + a1 * b2;
6789 xmm3 = xmm3 + a1 * b3;
6790 xmm4 = xmm4 + a1 * b4;
6791 xmm5 = xmm5 + a2 * b1;
6792 xmm6 = xmm6 + a2 * b2;
6793 xmm7 = xmm7 + a2 * b3;
6794 xmm8 = xmm8 + a2 * b4;
6797 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6798 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
6799 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
6800 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
6801 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
6802 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
6803 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
6804 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
// Single-row variant of tier 2 (presumably for a leftover row when M is odd; the
// guarding condition is not visible here).
6809 const size_t kbegin( ( IsUpper<MT4>::value )
6810 ?( ( IsLower<MT5>::value )
6811 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6812 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6813 :( IsLower<MT5>::value ? j : 0UL ) );
6814 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
6816 SIMDType xmm1, xmm2, xmm3, xmm4;
6818 for(
size_t k=kbegin; k<kend; ++k ) {
6819 const SIMDType a1(
set( A(i,k) ) );
6820 xmm1 = xmm1 + a1 * B.load(k,j );
6821 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
6822 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
6823 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
6826 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6827 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
6828 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
6829 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
// Tier 3: 2 SIMD vectors per step, two rows of C at a time (2x2 register block).
6833 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6837 for( ; (i+2UL) <= M; i+=2UL )
6839 const size_t kbegin( ( IsUpper<MT4>::value )
6840 ?( ( IsLower<MT5>::value )
6841 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6842 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6843 :( IsLower<MT5>::value ? j : 0UL ) );
6844 const size_t kend( ( IsLower<MT4>::value )
6845 ?( ( IsUpper<MT5>::value )
6846 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6847 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6848 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
6850 SIMDType xmm1, xmm2, xmm3, xmm4;
6852 for(
size_t k=kbegin; k<kend; ++k ) {
6853 const SIMDType a1(
set( A(i ,k) ) );
6854 const SIMDType a2(
set( A(i+1UL,k) ) );
6855 const SIMDType b1( B.load(k,j ) );
6856 const SIMDType b2( B.load(k,j+SIMDSIZE) );
6857 xmm1 = xmm1 + a1 * b1;
6858 xmm2 = xmm2 + a1 * b2;
6859 xmm3 = xmm3 + a2 * b1;
6860 xmm4 = xmm4 + a2 * b2;
6863 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6864 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
6865 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
6866 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
// Single-row variant of tier 3 (guarding condition not visible here).
6871 const size_t kbegin( ( IsUpper<MT4>::value )
6872 ?( ( IsLower<MT5>::value )
6873 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6874 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6875 :( IsLower<MT5>::value ? j : 0UL ) );
6876 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
6878 SIMDType xmm1, xmm2;
6880 for(
size_t k=kbegin; k<kend; ++k ) {
6881 const SIMDType a1(
set( A(i,k) ) );
6882 xmm1 = xmm1 + a1 * B.load(k,j );
6883 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
6886 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6887 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - xmm2 * factor );
// Tier 4: 1 SIMD vector per step, two rows of C at a time.
6891 for( ; j<jpos; j+=SIMDSIZE )
6895 for( ; (i+2UL) <= M; i+=2UL )
6897 const size_t kbegin( ( IsUpper<MT4>::value )
6898 ?( ( IsLower<MT5>::value )
6899 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6900 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6901 :( IsLower<MT5>::value ? j : 0UL ) );
6902 const size_t kend( ( IsLower<MT4>::value )
6903 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6906 SIMDType xmm1, xmm2;
6908 for(
size_t k=kbegin; k<kend; ++k ) {
6909 const SIMDType b1( B.load(k,j) );
6910 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
6911 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
6914 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6915 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
// Single-row variant of tier 4; here k runs up to K (no kend is computed).
6920 const size_t kbegin( ( IsUpper<MT4>::value )
6921 ?( ( IsLower<MT5>::value )
6922 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6923 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6924 :( IsLower<MT5>::value ? j : 0UL ) );
6928 for(
size_t k=kbegin; k<K; ++k ) {
6929 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
6932 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// Scalar remainder: columns in [jpos,N) handled element-wise, two rows at a time.
6936 for( ; remainder && j<N; ++j )
6940 for( ; (i+2UL) <= M; i+=2UL )
6942 const size_t kbegin( ( IsUpper<MT4>::value )
6943 ?( ( IsLower<MT5>::value )
6944 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6945 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6946 :( IsLower<MT5>::value ? j : 0UL ) );
6947 const size_t kend( ( IsLower<MT4>::value )
6948 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6954 for(
size_t k=kbegin; k<kend; ++k ) {
6955 value1 += A(i ,k) * B(k,j);
6956 value2 += A(i+1UL,k) * B(k,j);
6959 (~C)(i ,j) -= value1 * scalar;
6960 (~C)(i+1UL,j) -= value2 * scalar;
// Single-row variant of the scalar remainder (guarding condition not visible here).
6965 const size_t kbegin( ( IsUpper<MT4>::value )
6966 ?( ( IsLower<MT5>::value )
6967 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6968 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6969 :( IsLower<MT5>::value ? j : 0UL ) );
6973 for(
size_t k=kbegin; k<K; ++k ) {
6974 value += A(i,k) * B(k,j);
6977 (~C)(i,j) -= value * scalar;
// Vectorized small-matrix kernel for a column-major target (DenseMatrix<MT3,true>).
// No SIMD loop is written here: one operand is converted (serially) to its
// OppositeType and the resulting scaled product is passed to subAssign().
// Operand choice in the visible branches: A if only B is resizable, B if only A is
// resizable, otherwise A when it has no more elements than B, else B.
// NOTE(review): braces and the final 'else' are missing from this listing.
6997 template<
typename MT3
7001 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7002 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7009 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
7010 const OppositeType_<MT4> tmp(
serial( A ) );
7011 subAssign( ~C, tmp * B * scalar );
7013 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
7014 const OppositeType_<MT5> tmp(
serial( B ) );
7015 subAssign( ~C, A * tmp * scalar );
// Tie-break by element count (rows*columns) of the two operands.
7017 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7018 const OppositeType_<MT4> tmp(
serial( A ) );
7019 subAssign( ~C, tmp * B * scalar );
7022 const OppositeType_<MT5> tmp(
serial( B ) );
7023 subAssign( ~C, A * tmp * scalar );
// Large-matrix subtraction-assignment kernel for type combinations that do not
// satisfy UseVectorizedDefaultKernel: simply forwards to the default kernel.
7042 template<
typename MT3
7046 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7047 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7049 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized large-matrix kernel for C -= A*B*scalar with a row-major target
// (DenseMatrix<MT3,false>), enabled by UseVectorizedDefaultKernel.
// The iteration space is blocked in j, i and k by DMATDMATMULT_JBLOCK_SIZE,
// DMATDMATMULT_IBLOCK_SIZE and DMATDMATMULT_KBLOCK_SIZE. Inside a block the
// columns are processed in tiers of 4, 2 and 1 SIMD vectors, followed by a scalar
// loop for leftover columns. Because the k dimension is blocked, C is updated
// once per k-block with the partial sum of that block.
// NOTE(review): this listing omits many lines of the definition (declarations of
// i, j, xmm1, value, braces, guards of the single-row variants); the comments
// describe only what is visible.
7068 template<
typename MT3
7072 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7073 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7075 const size_t M( A.rows() );
7076 const size_t N( B.columns() );
7077 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
7079 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// The scalar broadcast into a SIMD vector.
7081 const SIMDType factor(
set( scalar ) );
// Column blocks.
7083 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
7085 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// End of the SIMD-processed columns of this block; the assertion below states the
// intended value: jend rounded down to a multiple of SIMDSIZE.
7087 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
7088 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Row blocks.
7090 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
7092 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Blocks of the inner dimension; ktmp is the exclusive end of the current k-block.
7094 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
7096 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// Tier 1: 4 SIMD vectors per step, two rows at a time (2x4 register block).
7100 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7102 const size_t j1( j+SIMDSIZE );
7103 const size_t j2( j+SIMDSIZE*2UL );
7104 const size_t j3( j+SIMDSIZE*3UL );
7108 for( ; (i+2UL) <= iend; i+=2UL )
// k-range of the current block, narrowed by the upper/lower structure of A and B.
7110 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7111 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7112 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7113 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
// Accumulators are declared without initializer.
// NOTE(review): relies on SIMDType default construction yielding zero - confirm.
7115 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7117 for(
size_t k=kbegin; k<kend; ++k ) {
7118 const SIMDType a1(
set( A(i ,k) ) );
7119 const SIMDType a2(
set( A(i+1UL,k) ) );
7120 const SIMDType b1( B.load(k,j ) );
7121 const SIMDType b2( B.load(k,j1) );
7122 const SIMDType b3( B.load(k,j2) );
7123 const SIMDType b4( B.load(k,j3) );
7124 xmm1 = xmm1 + a1 * b1;
7125 xmm2 = xmm2 + a1 * b2;
7126 xmm3 = xmm3 + a1 * b3;
7127 xmm4 = xmm4 + a1 * b4;
7128 xmm5 = xmm5 + a2 * b1;
7129 xmm6 = xmm6 + a2 * b2;
7130 xmm7 = xmm7 + a2 * b3;
7131 xmm8 = xmm8 + a2 * b4;
// Scale the accumulated products and subtract them from the loaded values of C.
7134 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7135 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7136 (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
7137 (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
7138 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
7139 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
7140 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
7141 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
// Single-row variant of tier 1 (presumably for a leftover row of the block; the
// guarding condition is not visible here).
7146 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7147 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7148 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7149 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
7151 SIMDType xmm1, xmm2, xmm3, xmm4;
7153 for(
size_t k=kbegin; k<kend; ++k ) {
7154 const SIMDType a1(
set( A(i,k) ) );
7155 xmm1 = xmm1 + a1 * B.load(k,j );
7156 xmm2 = xmm2 + a1 * B.load(k,j1);
7157 xmm3 = xmm3 + a1 * B.load(k,j2);
7158 xmm4 = xmm4 + a1 * B.load(k,j3);
7161 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7162 (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
7163 (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
7164 (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
// Tier 2: 2 SIMD vectors per step; rows are taken four at a time first (4x2
// register block), then two at a time, then singly.
7168 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
7170 const size_t j1( j+SIMDSIZE );
7174 for( ; (i+4UL) <= iend; i+=4UL )
7176 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7177 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7178 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7179 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7181 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7183 for(
size_t k=kbegin; k<kend; ++k ) {
7184 const SIMDType a1(
set( A(i ,k) ) );
7185 const SIMDType a2(
set( A(i+1UL,k) ) );
7186 const SIMDType a3(
set( A(i+2UL,k) ) );
7187 const SIMDType a4(
set( A(i+3UL,k) ) );
7188 const SIMDType b1( B.load(k,j ) );
7189 const SIMDType b2( B.load(k,j1) );
7190 xmm1 = xmm1 + a1 * b1;
7191 xmm2 = xmm2 + a1 * b2;
7192 xmm3 = xmm3 + a2 * b1;
7193 xmm4 = xmm4 + a2 * b2;
7194 xmm5 = xmm5 + a3 * b1;
7195 xmm6 = xmm6 + a3 * b2;
7196 xmm7 = xmm7 + a4 * b1;
7197 xmm8 = xmm8 + a4 * b2;
7200 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7201 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7202 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7203 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
7204 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
7205 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
7206 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
7207 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
// Two-row variant of tier 2.
7210 for( ; (i+2UL) <= iend; i+=2UL )
7212 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7213 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7214 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7215 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7217 SIMDType xmm1, xmm2, xmm3, xmm4;
7219 for(
size_t k=kbegin; k<kend; ++k ) {
7220 const SIMDType a1(
set( A(i ,k) ) );
7221 const SIMDType a2(
set( A(i+1UL,k) ) );
7222 const SIMDType b1( B.load(k,j ) );
7223 const SIMDType b2( B.load(k,j1) );
7224 xmm1 = xmm1 + a1 * b1;
7225 xmm2 = xmm2 + a1 * b2;
7226 xmm3 = xmm3 + a2 * b1;
7227 xmm4 = xmm4 + a2 * b2;
7230 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7231 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7232 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7233 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
// Single-row variant of tier 2 (guarding condition not visible here).
7238 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7239 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7240 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7241 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7243 SIMDType xmm1, xmm2;
7245 for(
size_t k=kbegin; k<kend; ++k ) {
7246 const SIMDType a1(
set( A(i,k) ) );
7247 xmm1 = xmm1 + a1 * B.load(k,j );
7248 xmm2 = xmm2 + a1 * B.load(k,j1);
7251 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7252 (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
// Tier 3: 1 SIMD vector per step, every row of the block individually.
7256 for( ; j<jpos; j+=SIMDSIZE )
7258 for(
size_t i=ii; i<iend; ++i )
7260 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7261 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7262 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7263 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
7267 for(
size_t k=kbegin; k<kend; ++k ) {
7268 const SIMDType a1(
set( A(i,k) ) );
7269 xmm1 = xmm1 + a1 * B.load(k,j);
7272 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// Scalar remainder: columns in [jpos,jend) handled element-wise.
7276 for( ; remainder && j<jend; ++j )
7278 for(
size_t i=ii; i<iend; ++i )
7280 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7281 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7282 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7283 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
7287 for(
size_t k=kbegin; k<kend; ++k ) {
7288 value += A(i,k) * B(k,j);
7291 (~C)(i,j) -= value * scalar;
// Vectorized large-matrix kernel for a column-major target (DenseMatrix<MT3,true>):
// no dedicated implementation, the call is forwarded to the small kernel.
7314 template<
typename MT3
7318 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7319 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7321 selectSmallSubAssignKernel( ~C, A, B, scalar );
// Fallback of the BLAS kernel selection for type combinations that do not satisfy
// UseBlasKernel: forwards to the large (non-BLAS) kernel.
7339 template<
typename MT3
7343 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7344 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7346 selectLargeSubAssignKernel( C, A, B, scalar );
7351 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-backed kernel for subtracting the scaled matrix product from C. The
// EnableIf_ in the return type restricts this overload to type combinations for
// which UseBlasKernel<MT3,MT4,MT5,ST2> is satisfied.
// NOTE(review): this listing omits parts of the definition (further template
// parameters, braces, the final 'else'); the comments cover only the visible lines.
7365 template<
typename MT3
7369 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7370 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Element type of the target; the scalar is converted to it before every BLAS call.
7372 typedef ElementType_<MT3> ET;
// A is triangular: a serial copy of B is handed to trmm() together with A
// (CblasLeft) and the scalar; the temporary is then subtracted from C.
7374 if( IsTriangular<MT4>::value ) {
7375 ResultType_<MT3> tmp(
serial( B ) );
7376 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7377 subAssign( C, tmp );
// B is triangular: same scheme with the roles swapped (copy of A, CblasRight).
7379 else if( IsTriangular<MT5>::value ) {
7380 ResultType_<MT3> tmp(
serial( A ) );
7381 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7382 subAssign( C, tmp );
// Remaining case: a single gemm() call; the subtraction is expressed by negating
// the scalar, the last factor is ET(1).
// NOTE(review): presumably alpha=-scalar and beta=1, i.e. C -= scalar*A*B - confirm
// against the gemm wrapper's parameter order.
7385 gemm( C, A, B, ET(-scalar), ET(1) );
// Subtraction assignment of the scaled product to a column-major matrix
// (Matrix<MT,true>). Enabled only if CanExploitSymmetry<MT,MT1,MT2> holds.
// The visible branches re-dispatch to subAssign() with trans() applied to the
// operand(s) tested as symmetric, so the product is rebuilt from transposed views.
// NOTE(review): braces and the final 'else' are missing from this listing.
7405 template<
typename MT >
7406 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7407 subAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7416 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7417 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
7419 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7420 subAssign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
// Only the left operand symmetric: transpose the left one.
7421 else if( IsSymmetric<MT1>::value )
7422 subAssign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Otherwise the right operand is the one that gets transposed.
7424 subAssign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// SMP assignment of the scaled product to a dense matrix, enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): only the operand extraction and the two guards are visible; the
// bodies of the guards and the actual assignment are not part of this listing.
7455 template<
typename MT
7457 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7458 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7465 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7466 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Empty target is tested separately from an empty inner dimension.
7468 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7471 else if( left.columns() == 0UL ) {
// SMP assignment of the scaled product to a sparse matrix, enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds. The expression is first evaluated into
// a temporary whose type matches the storage order SO of the target.
// NOTE(review): the statement that assigns tmp to lhs is not part of this listing.
7505 template<
typename MT
7507 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7508 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// OppositeType if SO is true, ResultType otherwise.
7512 typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
7524 const TmpType tmp( rhs );
// SMP assignment of the scaled product to a column-major matrix (Matrix<MT,true>),
// enabled when CanExploitSymmetry<MT,MT1,MT2> holds. The dispatch tests which
// operands are symmetric.
// NOTE(review): the statements executed in each branch are not part of this listing.
7543 template<
typename MT >
7544 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7545 smpAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7554 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7555 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7557 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7559 else if( IsSymmetric<MT1>::value )
// SMP addition assignment of the scaled product to a dense matrix, enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): only the operand extraction and the early-exit guard are visible;
// the rest of the body is not part of this listing.
7581 template<
typename MT
7583 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7584 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7591 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7592 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Guard for an empty target or an empty inner dimension.
7594 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP addition assignment of the scaled product to a column-major matrix
// (Matrix<MT,true>), enabled when CanExploitSymmetry<MT,MT1,MT2> holds. The
// dispatch tests which operands are symmetric.
// NOTE(review): the statements executed in each branch are not part of this listing.
7626 template<
typename MT >
7627 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7628 smpAddAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7637 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7638 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7640 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7642 else if( IsSymmetric<MT1>::value )
// SMP subtraction assignment of the scaled product to a dense matrix, enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): only the operand extraction and the early-exit guard are visible;
// the rest of the body is not part of this listing.
7668 template<
typename MT
7670 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7671 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7678 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7679 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Guard for an empty target or an empty inner dimension.
7681 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of the scaled product to a column-major matrix
// (Matrix<MT,true>), enabled when CanExploitSymmetry<MT,MT1,MT2> holds. The
// dispatch tests which operands are symmetric.
// NOTE(review): the statements executed in each branch are not part of this listing.
7713 template<
typename MT >
7714 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7715 smpSubAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7724 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7725 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7727 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7729 else if( IsSymmetric<MT1>::value )
// Header of a free function template returning const DMatDMatMultExpr<T1,T2>.
// NOTE(review): the line carrying the function name and parameters is missing
// from this listing; presumably this is operator*() for two row-major dense
// matrices - confirm in the full header.
7797 template<
typename T1
7799 inline const DMatDMatMultExpr<T1,T2>
// Two template headers over the operand types MT1 and MT2.
// NOTE(review): the declarations they introduce are not part of this listing;
// presumably further trait specializations for DMatDMatMultExpr - confirm in the
// full header.
7823 template<
typename MT1,
typename MT2 >
7840 template<
typename MT1,
typename MT2 >
// Trait specialization deriving from BoolConstant: true only if both
// IsAligned<MT1> and IsAligned<MT2> hold.
// NOTE(review): the struct line is missing from this listing; presumably
// IsAligned< DMatDMatMultExpr<MT1,MT2> > - confirm in the full header.
7857 template<
typename MT1,
typename MT2 >
7859 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
// Trait specialization deriving from BoolConstant: true only if both
// IsLower<MT1> and IsLower<MT2> hold.
// NOTE(review): the struct line is missing from this listing; presumably
// IsLower< DMatDMatMultExpr<MT1,MT2> > - confirm in the full header.
7875 template<
typename MT1,
typename MT2 >
7877 :
public BoolConstant< And< IsLower<MT1>, IsLower<MT2> >::value >
// Trait specialization deriving from BoolConstant: true only if both
// IsUniLower<MT1> and IsUniLower<MT2> hold.
// NOTE(review): the struct line is missing from this listing; presumably
// IsUniLower< DMatDMatMultExpr<MT1,MT2> > - confirm in the full header.
7893 template<
typename MT1,
typename MT2 >
7895 :
public BoolConstant< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
// Trait specialization deriving from BoolConstant: true if one operand is
// strictly lower and the other one is lower (either order).
// NOTE(review): the struct line is missing from this listing; presumably
// IsStrictlyLower< DMatDMatMultExpr<MT1,MT2> > - confirm in the full header.
7911 template<
typename MT1,
typename MT2 >
7913 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7914 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
// Trait specialization deriving from BoolConstant: true only if both
// IsUpper<MT1> and IsUpper<MT2> hold.
// NOTE(review): the struct line is missing from this listing; presumably
// IsUpper< DMatDMatMultExpr<MT1,MT2> > - confirm in the full header.
7930 template<
typename MT1,
typename MT2 >
7932 :
public BoolConstant< And< IsUpper<MT1>, IsUpper<MT2> >::value >
// Trait specialization deriving from BoolConstant: true only if both
// IsUniUpper<MT1> and IsUniUpper<MT2> hold.
// NOTE(review): the struct line is missing from this listing; presumably
// IsUniUpper< DMatDMatMultExpr<MT1,MT2> > - confirm in the full header.
7948 template<
typename MT1,
typename MT2 >
7950 :
public BoolConstant< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
// Trait specialization deriving from BoolConstant: true if one operand is
// strictly upper and the other one is upper (either order).
// NOTE(review): the struct line is missing from this listing; presumably
// IsStrictlyUpper< DMatDMatMultExpr<MT1,MT2> > - confirm in the full header.
7966 template<
typename MT1,
typename MT2 >
7968 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7969 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
// Result-type alias for multiplying the matrix product by a dense column vector.
// If MT1 and MT2 are row-major dense matrices and VT is a dense column vector, the
// type is that of MT1 * ( MT2 * VT ), i.e. the vector is multiplied with the
// right-hand matrix first.
// NOTE(review): the struct line and the alternative branch of If_ are missing
// from this listing.
7985 template<
typename MT1,
typename MT2,
typename VT >
7990 using Type = If_< And< IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
7991 , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2>
7992 , IsDenseVector<VT>, IsColumnVector<VT> >
7993 , DMatDVecMultExprTrait_< MT1, DMatDVecMultExprTrait_<MT2,VT> >
// Result-type alias for multiplying the matrix product by a sparse column vector.
// If MT1 and MT2 are row-major dense matrices and VT is a sparse column vector,
// the type is that of MT1 * ( MT2 * VT ): the inner product uses the
// dense-matrix/sparse-vector trait, the outer one the dense-matrix/dense-vector trait.
// NOTE(review): the struct line and the alternative branch of If_ are missing
// from this listing.
8003 template<
typename MT1,
typename MT2,
typename VT >
8008 using Type = If_< And< IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8009 , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2>
8010 , IsSparseVector<VT>, IsColumnVector<VT> >
8011 , DMatDVecMultExprTrait_< MT1, DMatSVecMultExprTrait_<MT2,VT> >
// Result-type alias for multiplying a dense row vector by the matrix product.
// If VT is a dense row vector and MT1, MT2 are row-major dense matrices, the type
// is that of ( VT * MT1 ) * MT2, i.e. the vector is multiplied with the left-hand
// matrix first.
// NOTE(review): the struct line and the alternative branch of If_ are missing
// from this listing.
8021 template<
typename VT,
typename MT1,
typename MT2 >
8026 using Type = If_< And< IsDenseVector<VT>, IsRowVector<VT>
8027 , IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8028 , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2> >
8029 , TDVecDMatMultExprTrait_< TDVecDMatMultExprTrait_<VT,MT1>, MT2 >
// Result-type alias for multiplying a sparse row vector by the matrix product.
// If VT is a sparse row vector and MT1, MT2 are row-major dense matrices, the type
// is that of ( VT * MT1 ) * MT2: the inner product uses the
// sparse-vector/dense-matrix trait, the outer one the dense-vector/dense-matrix trait.
// NOTE(review): the struct line and the alternative branch of If_ are missing
// from this listing.
8039 template<
typename VT,
typename MT1,
typename MT2 >
8044 using Type = If_< And< IsSparseVector<VT>, IsRowVector<VT>
8045 , IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8046 , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2> >
8047 , TDVecDMatMultExprTrait_< TSVecDMatMultExprTrait_<VT,MT1>, MT2 >
// Result-type alias parameterized by the alignment flag AF: the type is the
// product type of the submatrix expression types of const MT1 and const MT2.
// NOTE(review): the struct line is missing from this listing; presumably the
// SubmatrixExprTrait specialization for DMatDMatMultExpr - confirm.
8057 template<
typename MT1,
typename MT2,
bool AF >
8062 using Type = MultExprTrait_< SubmatrixExprTrait_<const MT1,AF>
8063 , SubmatrixExprTrait_<const MT2,AF> >;
// Result-type alias: the product type of a row expression of const MT1 and MT2.
// NOTE(review): the struct line is missing from this listing; presumably the
// RowExprTrait specialization for DMatDMatMultExpr - confirm.
8072 template<
typename MT1,
typename MT2 >
8077 using Type = MultExprTrait_< RowExprTrait_<const MT1>, MT2 >;
// Result-type alias: the product type of MT1 and a column expression of const MT2.
// NOTE(review): the struct line is missing from this listing; presumably the
// ColumnExprTrait specialization for DMatDMatMultExpr - confirm.
8086 template<
typename MT1,
typename MT2 >
8091 using Type = MultExprTrait_< MT1, ColumnExprTrait_<const MT2> >;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:250
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7800
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Header file for the SparseVector base class.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:291
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
DMatDMatMultExpr< MT1, MT2 > This
Type of this DMatDMatMultExpr instance.
Definition: DMatDMatMultExpr.h:246
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:390
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Expression object for dense matrix-dense matrix multiplications. The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:149
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:249
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:251
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:400
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:247
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:248
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:262
If_< IsExpression< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:259
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:156
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:434
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:155
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:254
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:354
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:157
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:306
System settings for the BLAS mode.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:453
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:412
If_< IsExpression< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:256
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:253
Constraint on the data type.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:444
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:370
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions. The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:252
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:424
Header file for the AreSIMDCombinable type trait.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
Header file for the IsRowMajorMatrix type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:380
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:950
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
Header file for the TSVecDMatMultExprTrait class template.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:265
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:454
Header file for the IsResizable type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_INTERNAL_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.