Blaze 3.9
TDMatTDMatMultExpr.h
1//=================================================================================================
33//=================================================================================================
34
35#ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
37
38
39//*************************************************************************************************
40// Includes
41//*************************************************************************************************
42
45#include <blaze/math/Aliases.h>
71#include <blaze/math/SIMD.h>
103#include <blaze/system/BLAS.h>
110#include <blaze/util/Assert.h>
111#include <blaze/util/Complex.h>
113#include <blaze/util/EnableIf.h>
116#include <blaze/util/mpl/If.h>
117#include <blaze/util/Types.h>
126
127
128namespace blaze {
129
130//=================================================================================================
131//
132// CLASS TDMATTDMATMULTEXPR
133//
134//=================================================================================================
135
136//*************************************************************************************************
143template< typename MT1 // Type of the left-hand side dense matrix
144 , typename MT2 // Type of the right-hand side dense matrix
145 , bool SF // Symmetry flag
146 , bool HF // Hermitian flag
147 , bool LF // Lower flag
148 , bool UF > // Upper flag
149class TDMatTDMatMultExpr
150   : public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
151 , private Computation
152{
153 private:
154   //**Type definitions****************************************************************************
155   using RT1 = ResultType_t<MT1>;     //!< Result type of the left-hand side dense matrix expression.
156   using RT2 = ResultType_t<MT2>;     //!< Result type of the right-hand side dense matrix expression.
157   using ET1 = ElementType_t<RT1>;    //!< Element type of the left-hand side dense matrix expression.
158   using ET2 = ElementType_t<RT2>;    //!< Element type of the right-hand side dense matrix expression.
159   using CT1 = CompositeType_t<MT1>;  //!< Composite type of the left-hand side dense matrix expression.
160   using CT2 = CompositeType_t<MT2>;  //!< Composite type of the right-hand side dense matrix expression.
161   //**********************************************************************************************
162
163 //**********************************************************************************************
165 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
166 //**********************************************************************************************
167
168 //**********************************************************************************************
170 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
171 //**********************************************************************************************
172
173 //**********************************************************************************************
174 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
175 static constexpr bool HERM = ( HF && !( LF || UF ) );
176 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
177 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
178 //**********************************************************************************************
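   // Illustrative note (sketch, assuming the usual blaze::declsym()/decllow()/declupp()
   // declaration operations): the four template flags are set when a product is explicitly
   // declared, and SYM/HERM/LOW/UPP condense them into the effective property of the result:
   //
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 10UL, 10UL ), B( 10UL, 10UL ), C;
   //    C = declsym( A * B );   // SF=true  -> SYM : result declared symmetric
   //    C = decllow( A * B );   // LF=true  -> LOW : result declared lower triangular
   //    C = declupp( A * B );   // UF=true  -> UPP : result declared upper triangular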
179
180 //**********************************************************************************************
182
187 template< typename T1, typename T2, typename T3 >
188 static constexpr bool CanExploitSymmetry_v =
189 ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
191 //**********************************************************************************************
192
193 //**********************************************************************************************
195
199 template< typename T1, typename T2, typename T3 >
200 static constexpr bool IsEvaluationRequired_v =
201      ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
203 //**********************************************************************************************
204
205 //**********************************************************************************************
207
210 template< typename T1, typename T2, typename T3 >
211 static constexpr bool UseBlasKernel_v =
212 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
213 !SYM && !HERM && !LOW && !UPP &&
214 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
215 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
216 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
217 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
218 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
219 IsBLASCompatible_v< ElementType_t<T1> > &&
220 IsBLASCompatible_v< ElementType_t<T2> > &&
221 IsBLASCompatible_v< ElementType_t<T3> > &&
222 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
223 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
225 //**********************************************************************************************
226
227 //**********************************************************************************************
229
232 template< typename T1, typename T2, typename T3 >
233 static constexpr bool UseVectorizedDefaultKernel_v =
234 ( useOptimizedKernels &&
235 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
236 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
237        IsSIMDCombinable_v< ElementType_t<T1>
238                          , ElementType_t<T2>
239                          , ElementType_t<T3> > &&
240 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
241 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
243 //**********************************************************************************************
244
245 //**********************************************************************************************
247
250 using ForwardFunctor = If_t< HERM
251 , DeclHerm
252 , If_t< SYM
253 , DeclSym
254 , If_t< LOW
255 , If_t< UPP
256 , DeclDiag
257 , DeclLow >
258 , If_t< UPP
259 , DeclUpp
260 , Noop > > > >;
262 //**********************************************************************************************
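   // For a single active flag the nested If_t collapses to the matching functor: DeclSym for
   // SYM, DeclHerm for HERM, DeclLow/DeclUpp for LOW/UPP, DeclDiag when both LOW and UPP are
   // set, and Noop for an unmarked product. The functor is applied to restructured products
   // (see the restructuring assign() below) so that a declared property is not lost.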
263
264 public:
265 //**Type definitions****************************************************************************
268
271
273   using ResultType = typename If_t< HERM
274                                   , DeclHermTrait< MultTrait_t<RT1,RT2> >
275                                   , If_t< SYM
276                                         , DeclSymTrait< MultTrait_t<RT1,RT2> >
277                                         , If_t< LOW
278                                               , If_t< UPP
279                                                     , DeclDiagTrait< MultTrait_t<RT1,RT2> >
280                                                     , DeclLowTrait< MultTrait_t<RT1,RT2> > >
281                                               , If_t< UPP
282                                                     , DeclUppTrait< MultTrait_t<RT1,RT2> >
283                                                     , MultTrait<RT1,RT2> > > > >::Type;
284
285   using OppositeType  = OppositeType_t<ResultType>;   //!< Result type with opposite storage order for expression template evaluations.
286   using TransposeType = TransposeType_t<ResultType>;  //!< Transpose type for expression template evaluations.
287   using ElementType   = ElementType_t<ResultType>;    //!< Resulting element type.
288   using SIMDType      = SIMDTrait_t<ElementType>;     //!< Resulting SIMD element type.
289 using ReturnType = const ElementType;
290 using CompositeType = const ResultType;
291
293 using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
294
296 using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
297
298   //! Type for the assignment of the left-hand side dense matrix operand.
299   using LT = If_t< evaluateLeft, const RT1, CT1 >;
300
301   //! Type for the assignment of the right-hand side dense matrix operand.
302   using RT = If_t< evaluateRight, const RT2, CT2 >;
303 //**********************************************************************************************
304
305 //**Compilation flags***************************************************************************
307 static constexpr bool simdEnabled =
308 ( !IsDiagonal_v<MT1> &&
309 MT1::simdEnabled && MT2::simdEnabled &&
310 HasSIMDAdd_v<ET1,ET2> &&
311 HasSIMDMult_v<ET1,ET2> );
312
314 static constexpr bool smpAssignable =
315 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
316 //**********************************************************************************************
317
318 //**SIMD properties*****************************************************************************
320 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
321 //**********************************************************************************************
322
323 //**Constructor*********************************************************************************
329 inline TDMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
330 : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
331 , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
332 {
333 BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
334 }
335 //**********************************************************************************************
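   // Minimal usage sketch (assuming blaze::DynamicMatrix): the expression is not instantiated
   // directly but created by multiplying two column-major dense matrices:
   //
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 64UL, 32UL ), B( 32UL, 48UL );
   //    // ... initialize A and B ...
   //    blaze::DynamicMatrix<double,blaze::columnMajor> C( A * B );  // evaluates the expression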
336
337 //**Access operator*****************************************************************************
344 inline ReturnType operator()( size_t i, size_t j ) const {
345 BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
346 BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
347
348 if( IsDiagonal_v<MT1> ) {
349 return lhs_(i,i) * rhs_(i,j);
350 }
351 else if( IsDiagonal_v<MT2> ) {
352 return lhs_(i,j) * rhs_(j,j);
353 }
354 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
355 const size_t begin( ( IsUpper_v<MT1> )
356 ?( ( IsLower_v<MT2> )
357 ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
358 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
359 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
360 :( ( IsLower_v<MT2> )
361 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
362 :( 0UL ) ) );
363 const size_t end( ( IsLower_v<MT1> )
364 ?( ( IsUpper_v<MT2> )
365 ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
366 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
367 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
368 :( ( IsUpper_v<MT2> )
369 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
370 :( lhs_.columns() ) ) );
371
372 if( begin >= end ) return ElementType();
373
374 const size_t n( end - begin );
375
376         return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
377                subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
378      }
379 else {
380 return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
381 }
382 }
383 //**********************************************************************************************
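   // In scalar terms the operator computes C(i,j) = sum_{k=begin}^{end-1} lhs_(i,k) * rhs_(k,j),
   // where [begin,end) shrinks to the overlap of the non-zero ranges for triangular operands and
   // collapses to a single product for diagonal operands.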
384
385 //**At function*********************************************************************************
393 inline ReturnType at( size_t i, size_t j ) const {
394 if( i >= lhs_.rows() ) {
395 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
396 }
397 if( j >= rhs_.columns() ) {
398 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
399 }
400 return (*this)(i,j);
401 }
402 //**********************************************************************************************
403
404 //**Rows function*******************************************************************************
409 inline size_t rows() const noexcept {
410 return lhs_.rows();
411 }
412 //**********************************************************************************************
413
414 //**Columns function****************************************************************************
419 inline size_t columns() const noexcept {
420 return rhs_.columns();
421 }
422 //**********************************************************************************************
423
424 //**Left operand access*************************************************************************
429 inline LeftOperand leftOperand() const noexcept {
430 return lhs_;
431 }
432 //**********************************************************************************************
433
434 //**Right operand access************************************************************************
439 inline RightOperand rightOperand() const noexcept {
440 return rhs_;
441 }
442 //**********************************************************************************************
443
444 //**********************************************************************************************
450 template< typename T >
451 inline bool canAlias( const T* alias ) const noexcept {
452 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
453 }
454 //**********************************************************************************************
455
456 //**********************************************************************************************
462 template< typename T >
463 inline bool isAliased( const T* alias ) const noexcept {
464 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
465 }
466 //**********************************************************************************************
467
468 //**********************************************************************************************
473 inline bool isAligned() const noexcept {
474 return lhs_.isAligned() && rhs_.isAligned();
475 }
476 //**********************************************************************************************
477
478 //**********************************************************************************************
483 inline bool canSMPAssign() const noexcept {
484 return ( !BLAZE_BLAS_MODE ||
485               !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
486               !BLAZE_BLAS_IS_PARALLEL ||
487               ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
488 ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
489 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
490 }
491 //**********************************************************************************************
492
493 private:
494 //**Member variables****************************************************************************
495   LeftOperand  lhs_;  //!< Left-hand side dense matrix of the multiplication expression.
496   RightOperand rhs_;  //!< Right-hand side dense matrix of the multiplication expression.
497   //**********************************************************************************************
498
499 //**Assignment to dense matrices****************************************************************
512 template< typename MT // Type of the target dense matrix
513 , bool SO > // Storage order of the target dense matrix
514   friend inline auto assign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
515      -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
516 {
518
519 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
520 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
521
522 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
523 return;
524 }
525 else if( rhs.lhs_.columns() == 0UL ) {
526 reset( *lhs );
527 return;
528 }
529
530 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
531 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
532
533 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
534 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
535 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
536 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
537 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
538 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
539
540 TDMatTDMatMultExpr::selectAssignKernel( *lhs, A, B );
541 }
543 //**********************************************************************************************
544
545 //**Assignment to dense matrices (kernel selection)*********************************************
556 template< typename MT3 // Type of the left-hand side target matrix
557 , typename MT4 // Type of the left-hand side matrix operand
558 , typename MT5 > // Type of the right-hand side matrix operand
559 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
560 {
561 if( ( IsDiagonal_v<MT4> ) ||
562 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
563 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
564 selectSmallAssignKernel( C, A, B );
565 else
566 selectBlasAssignKernel( C, A, B );
567 }
569 //**********************************************************************************************
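   // Selection note: small or diagonal-dominated products stay on the inlined SIMD kernels,
   // larger ones are forwarded to the BLAS-based kernels. TDMATTDMATMULT_THRESHOLD (and the
   // SMP_TDMATTDMATMULT_THRESHOLD used in canSMPAssign()) are compile-time constants that, in a
   // default Blaze setup, can be adjusted via the configuration headers (blaze/config/Thresholds.h).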
570
571 //**Default assignment to dense matrices (general/general)**************************************
585 template< typename MT3 // Type of the left-hand side target matrix
586 , typename MT4 // Type of the left-hand side matrix operand
587 , typename MT5 > // Type of the right-hand side matrix operand
588 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
589 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
590 {
591 const size_t M( A.rows() );
592 const size_t N( B.columns() );
593 const size_t K( A.columns() );
594
595 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
596
597 for( size_t j=0UL; j<N; ++j )
598 {
599 const size_t kbegin( ( IsLower_v<MT5> )
600 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
601 :( 0UL ) );
602 const size_t kend( ( IsUpper_v<MT5> )
603 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
604 :( K ) );
605 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
606
607 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
608 for( size_t i=0UL; i<M; ++i ) {
609 reset( C(i,j) );
610 }
611 continue;
612 }
613
614 {
615 const size_t ibegin( ( IsLower_v<MT4> )
616 ?( ( IsStrictlyLower_v<MT4> )
617 ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
618 :( LOW ? max(j,kbegin) : kbegin ) )
619 :( LOW ? j : 0UL ) );
620 const size_t iend( ( IsUpper_v<MT4> )
621 ?( ( IsStrictlyUpper_v<MT4> )
622 ?( UPP ? min(j+1UL,kbegin) : kbegin )
623 :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
624 :( UPP ? j+1UL : M ) );
625
626 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
627 for( size_t i=0UL; i<ibegin; ++i ) {
628 reset( C(i,j) );
629 }
630 }
631 else if( IsStrictlyLower_v<MT4> ) {
632 reset( C(0UL,j) );
633 }
634 for( size_t i=ibegin; i<iend; ++i ) {
635 C(i,j) = A(i,kbegin) * B(kbegin,j);
636 }
637 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
638 for( size_t i=iend; i<M; ++i ) {
639 reset( C(i,j) );
640 }
641 }
642 else if( IsStrictlyUpper_v<MT4> ) {
643 reset( C(M-1UL,j) );
644 }
645 }
646
647 for( size_t k=kbegin+1UL; k<kend; ++k )
648 {
649 const size_t ibegin( ( IsLower_v<MT4> )
650 ?( ( IsStrictlyLower_v<MT4> )
651 ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
652 :( SYM || HERM || LOW ? max( j, k ) : k ) )
653 :( SYM || HERM || LOW ? j : 0UL ) );
654 const size_t iend( ( IsUpper_v<MT4> )
655 ?( ( IsStrictlyUpper_v<MT4> )
656 ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
657 :( UPP ? min(j+1UL,k) : k ) )
658 :( UPP ? j+1UL : M ) );
659
660 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
661 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
662
663 for( size_t i=ibegin; i<iend; ++i ) {
664 C(i,j) += A(i,k) * B(k,j);
665 }
666 if( IsUpper_v<MT4> ) {
667 C(iend,j) = A(iend,k) * B(k,j);
668 }
669 }
670 }
671
672 if( SYM || HERM ) {
673 for( size_t j=1UL; j<N; ++j ) {
674 for( size_t i=0UL; i<j; ++i ) {
675 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
676 }
677 }
678 }
679 }
681 //**********************************************************************************************
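   // Stripped of the triangular bookkeeping, the kernel above is the classic column-major
   // (j,k,i) loop nest; a minimal scalar sketch of the same update order:
   //
   //    for( size_t j=0UL; j<N; ++j )
   //       for( size_t k=0UL; k<K; ++k )
   //          for( size_t i=0UL; i<M; ++i )
   //             C(i,j) += A(i,k) * B(k,j);  // the kernel assigns on the first k-iteration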
682
683 //**Default assignment to dense matrices (general/diagonal)*************************************
697 template< typename MT3 // Type of the left-hand side target matrix
698 , typename MT4 // Type of the left-hand side matrix operand
699 , typename MT5 > // Type of the right-hand side matrix operand
700 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
701 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
702 {
704
705 const size_t M( A.rows() );
706 const size_t N( B.columns() );
707
708 for( size_t j=0UL; j<N; ++j )
709 {
710 const size_t ibegin( ( IsLower_v<MT4> )
711 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
712 :( 0UL ) );
713 const size_t iend( ( IsUpper_v<MT4> )
714 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
715 :( M ) );
716 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
717
718 if( IsLower_v<MT4> ) {
719 for( size_t i=0UL; i<ibegin; ++i ) {
720 reset( C(i,j) );
721 }
722 }
723 for( size_t i=ibegin; i<iend; ++i ) {
724 C(i,j) = A(i,j) * B(j,j);
725 }
726 if( IsUpper_v<MT4> ) {
727 for( size_t i=iend; i<M; ++i ) {
728 reset( C(i,j) );
729 }
730 }
731 }
732 }
734 //**********************************************************************************************
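   // With a diagonal B each column of the result is a scaled column of A, i.e.
   // C(i,j) = A(i,j) * B(j,j); rows outside the populated band of A are reset to zero.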
735
736 //**Default assignment to dense matrices (diagonal/general)*************************************
750 template< typename MT3 // Type of the left-hand side target matrix
751 , typename MT4 // Type of the left-hand side matrix operand
752 , typename MT5 > // Type of the right-hand side matrix operand
753 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
754 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
755 {
757
758 const size_t M( A.rows() );
759 const size_t N( B.columns() );
760
761 for( size_t j=0UL; j<N; ++j )
762 {
763 const size_t ibegin( ( IsLower_v<MT5> )
764 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
765 :( 0UL ) );
766 const size_t iend( ( IsUpper_v<MT5> )
767 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
768 :( M ) );
769 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
770
771 if( IsLower_v<MT4> ) {
772 for( size_t i=0UL; i<ibegin; ++i ) {
773 reset( C(i,j) );
774 }
775 }
776 for( size_t i=ibegin; i<iend; ++i ) {
777 C(i,j) = A(i,i) * B(i,j);
778 }
779 if( IsUpper_v<MT4> ) {
780 for( size_t i=iend; i<M; ++i ) {
781 reset( C(i,j) );
782 }
783 }
784 }
785 }
787 //**********************************************************************************************
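   // With a diagonal A each row of the result is a scaled row of B, i.e.
   // C(i,j) = A(i,i) * B(i,j); elements outside the populated band are reset to zero.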
788
789 //**Default assignment to dense matrices (diagonal/diagonal)************************************
803 template< typename MT3 // Type of the left-hand side target matrix
804 , typename MT4 // Type of the left-hand side matrix operand
805 , typename MT5 > // Type of the right-hand side matrix operand
806 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
807 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
808 {
810
811 reset( C );
812
813 for( size_t i=0UL; i<A.rows(); ++i ) {
814 C(i,i) = A(i,i) * B(i,i);
815 }
816 }
818 //**********************************************************************************************
819
820 //**Default assignment to dense matrices (small matrices)***************************************
834 template< typename MT3 // Type of the left-hand side target matrix
835 , typename MT4 // Type of the left-hand side matrix operand
836 , typename MT5 > // Type of the right-hand side matrix operand
837 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
838 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
839 {
840 selectDefaultAssignKernel( C, A, B );
841 }
843 //**********************************************************************************************
844
845 //**Vectorized default assignment to row-major dense matrices (small matrices)******************
860 template< typename MT3 // Type of the left-hand side target matrix
861 , typename MT4 // Type of the left-hand side matrix operand
862 , typename MT5 > // Type of the right-hand side matrix operand
863 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
864 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
865 {
870
871 const ForwardFunctor fwd;
872
873 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
874 const OppositeType_t<MT5> tmp( serial( B ) );
875 assign( C, fwd( A * tmp ) );
876 }
877 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
878 const OppositeType_t<MT4> tmp( serial( A ) );
879 assign( C, fwd( tmp * B ) );
880 }
881 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
882 const OppositeType_t<MT5> tmp( serial( B ) );
883 assign( C, fwd( A * tmp ) );
884 }
885 else {
886 const OppositeType_t<MT4> tmp( serial( A ) );
887 assign( C, fwd( tmp * B ) );
888 }
889 }
891 //**********************************************************************************************
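   // Design note: for a row-major target there is no dedicated column-major-times-column-major
   // kernel, so one operand is copied into its opposite (row-major) storage order and the work
   // is forwarded to the matching mixed-storage-order kernel; the cheaper operand to convert
   // (typically the smaller one) is chosen to keep the temporary inexpensive.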
892
893 //**Vectorized default assignment to column-major dense matrices (small matrices)***************
908 template< typename MT3 // Type of the left-hand side target matrix
909 , typename MT4 // Type of the left-hand side matrix operand
910 , typename MT5 > // Type of the right-hand side matrix operand
911 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
912 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
913 {
914 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
915
916 const size_t M( A.rows() );
917 const size_t N( B.columns() );
918 const size_t K( A.columns() );
919
920 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
921
922 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
923 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
924
925 size_t i( 0UL );
926
927 if( IsIntegral_v<ElementType> )
928 {
929 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
930 for( size_t j=0UL; j<N; ++j )
931 {
932 const size_t kbegin( ( IsLower_v<MT5> )
933 ?( ( IsUpper_v<MT4> )
934 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
935 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
936 :( IsUpper_v<MT4> ? i : 0UL ) );
937 const size_t kend( ( IsUpper_v<MT5> )
938 ?( ( IsLower_v<MT4> )
939 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
940 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
941 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
942
943 size_t k( kbegin );
944
945 if( k < kend )
946 {
947 SIMDType b1( set( B(k,j) ) );
948 SIMDType xmm1( A.load(i ,k) * b1 );
949 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
950 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
951 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
952 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
953 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,k) * b1 );
954 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,k) * b1 );
955 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,k) * b1 );
956
957 for( ++k; k<kend; ++k ) {
958 b1 = set( B(k,j) );
959 xmm1 += A.load(i ,k) * b1;
960 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
961 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
962 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
963 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
964 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
965 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
966 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
967 }
968
969 C.store( i , j, xmm1 );
970 C.store( i+SIMDSIZE , j, xmm2 );
971 C.store( i+SIMDSIZE*2UL, j, xmm3 );
972 C.store( i+SIMDSIZE*3UL, j, xmm4 );
973 C.store( i+SIMDSIZE*4UL, j, xmm5 );
974 C.store( i+SIMDSIZE*5UL, j, xmm6 );
975 C.store( i+SIMDSIZE*6UL, j, xmm7 );
976 C.store( i+SIMDSIZE*7UL, j, xmm8 );
977 }
978 else
979 {
980 const SIMDType zero;
981 C.store( i , j, zero );
982 C.store( i+SIMDSIZE , j, zero );
983 C.store( i+SIMDSIZE*2UL, j, zero );
984 C.store( i+SIMDSIZE*3UL, j, zero );
985 C.store( i+SIMDSIZE*4UL, j, zero );
986 C.store( i+SIMDSIZE*5UL, j, zero );
987 C.store( i+SIMDSIZE*6UL, j, zero );
988 C.store( i+SIMDSIZE*7UL, j, zero );
989 }
990 }
991 }
992 }
993
994 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
995 {
996 size_t j( 0UL );
997
998 for( ; (j+2UL) <= N; j+=2UL )
999 {
1000 const size_t kbegin( ( IsLower_v<MT5> )
1001 ?( ( IsUpper_v<MT4> )
1002 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1003 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1004 :( IsUpper_v<MT4> ? i : 0UL ) );
1005 const size_t kend( ( IsUpper_v<MT5> )
1006 ?( ( IsLower_v<MT4> )
1007 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1008 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1009 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
1010
1011 size_t k( kbegin );
1012
1013 if( k < kend )
1014 {
1015 SIMDType a1( A.load(i ,k) );
1016 SIMDType a2( A.load(i+SIMDSIZE ,k) );
1017 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1018 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1019 SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1020 SIMDType b1( set( B(k,j ) ) );
1021 SIMDType b2( set( B(k,j+1UL) ) );
1022 SIMDType xmm1 ( a1 * b1 );
1023 SIMDType xmm2 ( a2 * b1 );
1024 SIMDType xmm3 ( a3 * b1 );
1025 SIMDType xmm4 ( a4 * b1 );
1026 SIMDType xmm5 ( a5 * b1 );
1027 SIMDType xmm6 ( a1 * b2 );
1028 SIMDType xmm7 ( a2 * b2 );
1029 SIMDType xmm8 ( a3 * b2 );
1030 SIMDType xmm9 ( a4 * b2 );
1031 SIMDType xmm10( a5 * b2 );
1032
1033 for( ++k; k<kend; ++k ) {
1034 a1 = A.load(i ,k);
1035 a2 = A.load(i+SIMDSIZE ,k);
1036 a3 = A.load(i+SIMDSIZE*2UL,k);
1037 a4 = A.load(i+SIMDSIZE*3UL,k);
1038 a5 = A.load(i+SIMDSIZE*4UL,k);
1039 b1 = set( B(k,j ) );
1040 b2 = set( B(k,j+1UL) );
1041 xmm1 += a1 * b1;
1042 xmm2 += a2 * b1;
1043 xmm3 += a3 * b1;
1044 xmm4 += a4 * b1;
1045 xmm5 += a5 * b1;
1046 xmm6 += a1 * b2;
1047 xmm7 += a2 * b2;
1048 xmm8 += a3 * b2;
1049 xmm9 += a4 * b2;
1050 xmm10 += a5 * b2;
1051 }
1052
1053 C.store( i , j , xmm1 );
1054 C.store( i+SIMDSIZE , j , xmm2 );
1055 C.store( i+SIMDSIZE*2UL, j , xmm3 );
1056 C.store( i+SIMDSIZE*3UL, j , xmm4 );
1057 C.store( i+SIMDSIZE*4UL, j , xmm5 );
1058 C.store( i , j+1UL, xmm6 );
1059 C.store( i+SIMDSIZE , j+1UL, xmm7 );
1060 C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1061 C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1062 C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1063 }
1064 else
1065 {
1066 const SIMDType zero;
1067 C.store( i , j , zero );
1068 C.store( i+SIMDSIZE , j , zero );
1069 C.store( i+SIMDSIZE*2UL, j , zero );
1070 C.store( i+SIMDSIZE*3UL, j , zero );
1071 C.store( i+SIMDSIZE*4UL, j , zero );
1072 C.store( i , j+1UL, zero );
1073 C.store( i+SIMDSIZE , j+1UL, zero );
1074 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
1075 C.store( i+SIMDSIZE*3UL, j+1UL, zero );
1076 C.store( i+SIMDSIZE*4UL, j+1UL, zero );
1077 }
1078 }
1079
1080 if( j < N )
1081 {
1082 const size_t kbegin( ( IsLower_v<MT5> )
1083 ?( ( IsUpper_v<MT4> )
1084 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1085 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1086 :( IsUpper_v<MT4> ? i : 0UL ) );
1087 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1088
1089 size_t k( kbegin );
1090
1091 if( k < kend )
1092 {
1093 SIMDType b1( set( B(k,j) ) );
1094 SIMDType xmm1( A.load(i ,k) * b1 );
1095 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
1096 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
1097 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
1098 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
1099
1100 for( ++k; k<kend; ++k ) {
1101 b1 = set( B(k,j) );
1102 xmm1 += A.load(i ,k) * b1;
1103 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1104 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1105 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1106 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1107 }
1108
1109 C.store( i , j, xmm1 );
1110 C.store( i+SIMDSIZE , j, xmm2 );
1111 C.store( i+SIMDSIZE*2UL, j, xmm3 );
1112 C.store( i+SIMDSIZE*3UL, j, xmm4 );
1113 C.store( i+SIMDSIZE*4UL, j, xmm5 );
1114 }
1115 else
1116 {
1117 const SIMDType zero;
1118 C.store( i , j, zero );
1119 C.store( i+SIMDSIZE , j, zero );
1120 C.store( i+SIMDSIZE*2UL, j, zero );
1121 C.store( i+SIMDSIZE*3UL, j, zero );
1122 C.store( i+SIMDSIZE*4UL, j, zero );
1123 }
1124 }
1125 }
1126
1127 for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1128 {
1129 const size_t jend( LOW ? min(i+SIMDSIZE*4UL,N) : N );
1130 size_t j( 0UL );
1131
1132 if( SYM || HERM ) {
1133 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
1134 for( ; j<i; ++j ) {
1135 for( size_t ii=i; ii<iiend; ++ii ) {
1136 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1137 }
1138 }
1139 }
1140 else if( UPP ) {
1141 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
1142 for( ; j<i; ++j ) {
1143 for( size_t ii=i; ii<iiend; ++ii ) {
1144 reset( C(ii,j) );
1145 }
1146 }
1147 }
1148
1149 for( ; (j+2UL) <= jend; j+=2UL )
1150 {
1151 const size_t kbegin( ( IsLower_v<MT5> )
1152 ?( ( IsUpper_v<MT4> )
1153 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1154 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1155 :( IsUpper_v<MT4> ? i : 0UL ) );
1156 const size_t kend( ( IsUpper_v<MT5> )
1157 ?( ( IsLower_v<MT4> )
1158 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1159 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1160 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
1161
1162 size_t k( kbegin );
1163
1164 if( k < kend )
1165 {
1166 SIMDType a1( A.load(i ,k) );
1167 SIMDType a2( A.load(i+SIMDSIZE ,k) );
1168 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1169 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1170 SIMDType b1( set( B(k,j ) ) );
1171 SIMDType b2( set( B(k,j+1UL) ) );
1172 SIMDType xmm1( a1 * b1 );
1173 SIMDType xmm2( a2 * b1 );
1174 SIMDType xmm3( a3 * b1 );
1175 SIMDType xmm4( a4 * b1 );
1176 SIMDType xmm5( a1 * b2 );
1177 SIMDType xmm6( a2 * b2 );
1178 SIMDType xmm7( a3 * b2 );
1179 SIMDType xmm8( a4 * b2 );
1180
1181 for( ++k; k<kend; ++k ) {
1182 a1 = A.load(i ,k);
1183 a2 = A.load(i+SIMDSIZE ,k);
1184 a3 = A.load(i+SIMDSIZE*2UL,k);
1185 a4 = A.load(i+SIMDSIZE*3UL,k);
1186 b1 = set( B(k,j ) );
1187 b2 = set( B(k,j+1UL) );
1188 xmm1 += a1 * b1;
1189 xmm2 += a2 * b1;
1190 xmm3 += a3 * b1;
1191 xmm4 += a4 * b1;
1192 xmm5 += a1 * b2;
1193 xmm6 += a2 * b2;
1194 xmm7 += a3 * b2;
1195 xmm8 += a4 * b2;
1196 }
1197
1198 C.store( i , j , xmm1 );
1199 C.store( i+SIMDSIZE , j , xmm2 );
1200 C.store( i+SIMDSIZE*2UL, j , xmm3 );
1201 C.store( i+SIMDSIZE*3UL, j , xmm4 );
1202 C.store( i , j+1UL, xmm5 );
1203 C.store( i+SIMDSIZE , j+1UL, xmm6 );
1204 C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1205 C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1206 }
1207 else
1208 {
1209 const SIMDType zero;
1210 C.store( i , j , zero );
1211 C.store( i+SIMDSIZE , j , zero );
1212 C.store( i+SIMDSIZE*2UL, j , zero );
1213 C.store( i+SIMDSIZE*3UL, j , zero );
1214 C.store( i , j+1UL, zero );
1215 C.store( i+SIMDSIZE , j+1UL, zero );
1216 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
1217 C.store( i+SIMDSIZE*3UL, j+1UL, zero );
1218 }
1219 }
1220
1221 if( j < jend )
1222 {
1223 const size_t kbegin( ( IsLower_v<MT5> )
1224 ?( ( IsUpper_v<MT4> )
1225 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1226 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1227 :( IsUpper_v<MT4> ? i : 0UL ) );
1228 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1229
1230 size_t k( kbegin );
1231
1232 if( k < kend )
1233 {
1234 SIMDType b1( set( B(k,j) ) );
1235 SIMDType xmm1( A.load(i ,k) * b1 );
1236 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
1237 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
1238 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
1239
1240 for( ++k; k<kend; ++k ) {
1241 b1 = set( B(k,j) );
1242 xmm1 += A.load(i ,k) * b1;
1243 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1244 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1245 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1246 }
1247
1248 C.store( i , j, xmm1 );
1249 C.store( i+SIMDSIZE , j, xmm2 );
1250 C.store( i+SIMDSIZE*2UL, j, xmm3 );
1251 C.store( i+SIMDSIZE*3UL, j, xmm4 );
1252 }
1253 else
1254 {
1255 const SIMDType zero;
1256 C.store( i , j, zero );
1257 C.store( i+SIMDSIZE , j, zero );
1258 C.store( i+SIMDSIZE*2UL, j, zero );
1259 C.store( i+SIMDSIZE*3UL, j, zero );
1260 }
1261
1262 if( LOW ) ++j;
1263 }
1264
1265 if( LOW ) {
1266 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
1267 for( ; j<N; ++j ) {
1268 for( size_t ii=i; ii<iiend; ++ii ) {
1269 reset( C(ii,j) );
1270 }
1271 }
1272 }
1273 }
1274
1275 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1276 {
1277 const size_t jend( LOW ? min(i+SIMDSIZE*3UL,N) : N );
1278 size_t j( 0UL );
1279
1280 if( SYM || HERM ) {
1281 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
1282 for( ; j<i; ++j ) {
1283 for( size_t ii=i; ii<iiend; ++ii ) {
1284 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1285 }
1286 }
1287 }
1288 else if( UPP ) {
1289 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
1290 for( ; j<i; ++j ) {
1291 for( size_t ii=i; ii<iiend; ++ii ) {
1292 reset( C(ii,j) );
1293 }
1294 }
1295 }
1296
1297 for( ; (j+2UL) <= jend; j+=2UL )
1298 {
1299 const size_t kbegin( ( IsLower_v<MT5> )
1300 ?( ( IsUpper_v<MT4> )
1301 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1302 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1303 :( IsUpper_v<MT4> ? i : 0UL ) );
1304 const size_t kend( ( IsUpper_v<MT5> )
1305 ?( ( IsLower_v<MT4> )
1306 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1307 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1308 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
1309
1310 size_t k( kbegin );
1311
1312 if( k < kend )
1313 {
1314 SIMDType a1( A.load(i ,k) );
1315 SIMDType a2( A.load(i+SIMDSIZE ,k) );
1316 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1317 SIMDType b1( set( B(k,j ) ) );
1318 SIMDType b2( set( B(k,j+1UL) ) );
1319 SIMDType xmm1( a1 * b1 );
1320 SIMDType xmm2( a2 * b1 );
1321 SIMDType xmm3( a3 * b1 );
1322 SIMDType xmm4( a1 * b2 );
1323 SIMDType xmm5( a2 * b2 );
1324 SIMDType xmm6( a3 * b2 );
1325
1326 for( ++k; k<kend; ++k ) {
1327 a1 = A.load(i ,k);
1328 a2 = A.load(i+SIMDSIZE ,k);
1329 a3 = A.load(i+SIMDSIZE*2UL,k);
1330 b1 = set( B(k,j ) );
1331 b2 = set( B(k,j+1UL) );
1332 xmm1 += a1 * b1;
1333 xmm2 += a2 * b1;
1334 xmm3 += a3 * b1;
1335 xmm4 += a1 * b2;
1336 xmm5 += a2 * b2;
1337 xmm6 += a3 * b2;
1338 }
1339
1340 C.store( i , j , xmm1 );
1341 C.store( i+SIMDSIZE , j , xmm2 );
1342 C.store( i+SIMDSIZE*2UL, j , xmm3 );
1343 C.store( i , j+1UL, xmm4 );
1344 C.store( i+SIMDSIZE , j+1UL, xmm5 );
1345 C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1346 }
1347 else
1348 {
1349 const SIMDType zero;
1350 C.store( i , j , zero );
1351 C.store( i+SIMDSIZE , j , zero );
1352 C.store( i+SIMDSIZE*2UL, j , zero );
1353 C.store( i , j+1UL, zero );
1354 C.store( i+SIMDSIZE , j+1UL, zero );
1355 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
1356 }
1357 }
1358
1359 if( j < jend )
1360 {
1361 const size_t kbegin( ( IsLower_v<MT5> )
1362 ?( ( IsUpper_v<MT4> )
1363 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1364 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1365 :( IsUpper_v<MT4> ? i : 0UL ) );
1366 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
1367
1368 size_t k( kbegin );
1369
1370 if( k < kend )
1371 {
1372 SIMDType b1( set( B(k,j) ) );
1373 SIMDType xmm1( A.load(i ,k) * b1 );
1374 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
1375 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
1376
1377 for( ++k; k<kend; ++k ) {
1378 b1 = set( B(k,j) );
1379 xmm1 += A.load(i ,k) * b1;
1380 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1381 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1382 }
1383
1384 C.store( i , j, xmm1 );
1385 C.store( i+SIMDSIZE , j, xmm2 );
1386 C.store( i+SIMDSIZE*2UL, j, xmm3 );
1387 }
1388 else
1389 {
1390 const SIMDType zero;
1391 C.store( i , j, zero );
1392 C.store( i+SIMDSIZE , j, zero );
1393 C.store( i+SIMDSIZE*2UL, j, zero );
1394 }
1395
1396 if( LOW ) ++j;
1397 }
1398
1399 if( LOW ) {
1400 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
1401 for( ; j<N; ++j ) {
1402 for( size_t ii=i; ii<iiend; ++ii ) {
1403 reset( C(ii,j) );
1404 }
1405 }
1406 }
1407 }
1408
1409 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1410 {
1411 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
1412 size_t j( 0UL );
1413
1414 if( SYM || HERM ) {
1415 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
1416 for( ; j<i; ++j ) {
1417 for( size_t ii=i; ii<iiend; ++ii ) {
1418 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1419 }
1420 }
1421 }
1422 else if( UPP ) {
1423 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
1424 for( ; j<i; ++j ) {
1425 for( size_t ii=i; ii<iiend; ++ii ) {
1426 reset( C(ii,j) );
1427 }
1428 }
1429 }
1430
1431 for( ; (j+4UL) <= jend; j+=4UL )
1432 {
1433 const size_t kbegin( ( IsLower_v<MT5> )
1434 ?( ( IsUpper_v<MT4> )
1435 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1436 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1437 :( IsUpper_v<MT4> ? i : 0UL ) );
1438 const size_t kend( ( IsUpper_v<MT5> )
1439 ?( ( IsLower_v<MT4> )
1440 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
1441 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
1442 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1443
1444 size_t k( kbegin );
1445
1446 if( k < kend )
1447 {
1448 SIMDType a1( A.load(i ,k) );
1449 SIMDType a2( A.load(i+SIMDSIZE,k) );
1450 SIMDType b1( set( B(k,j ) ) );
1451 SIMDType b2( set( B(k,j+1UL) ) );
1452 SIMDType b3( set( B(k,j+2UL) ) );
1453 SIMDType b4( set( B(k,j+3UL) ) );
1454 SIMDType xmm1( a1 * b1 );
1455 SIMDType xmm2( a2 * b1 );
1456 SIMDType xmm3( a1 * b2 );
1457 SIMDType xmm4( a2 * b2 );
1458 SIMDType xmm5( a1 * b3 );
1459 SIMDType xmm6( a2 * b3 );
1460 SIMDType xmm7( a1 * b4 );
1461 SIMDType xmm8( a2 * b4 );
1462
1463 for( ++k; k<kend; ++k ) {
1464 a1 = A.load(i ,k);
1465 a2 = A.load(i+SIMDSIZE,k);
1466 b1 = set( B(k,j ) );
1467 b2 = set( B(k,j+1UL) );
1468 b3 = set( B(k,j+2UL) );
1469 b4 = set( B(k,j+3UL) );
1470 xmm1 += a1 * b1;
1471 xmm2 += a2 * b1;
1472 xmm3 += a1 * b2;
1473 xmm4 += a2 * b2;
1474 xmm5 += a1 * b3;
1475 xmm6 += a2 * b3;
1476 xmm7 += a1 * b4;
1477 xmm8 += a2 * b4;
1478 }
1479
1480 C.store( i , j , xmm1 );
1481 C.store( i+SIMDSIZE, j , xmm2 );
1482 C.store( i , j+1UL, xmm3 );
1483 C.store( i+SIMDSIZE, j+1UL, xmm4 );
1484 C.store( i , j+2UL, xmm5 );
1485 C.store( i+SIMDSIZE, j+2UL, xmm6 );
1486 C.store( i , j+3UL, xmm7 );
1487 C.store( i+SIMDSIZE, j+3UL, xmm8 );
1488 }
1489 else
1490 {
1491 const SIMDType zero;
1492 C.store( i , j , zero );
1493 C.store( i+SIMDSIZE, j , zero );
1494 C.store( i , j+1UL, zero );
1495 C.store( i+SIMDSIZE, j+1UL, zero );
1496 C.store( i , j+2UL, zero );
1497 C.store( i+SIMDSIZE, j+2UL, zero );
1498 C.store( i , j+3UL, zero );
1499 C.store( i+SIMDSIZE, j+3UL, zero );
1500 }
1501 }
1502
1503 for( ; (j+3UL) <= jend; j+=3UL )
1504 {
1505 const size_t kbegin( ( IsLower_v<MT5> )
1506 ?( ( IsUpper_v<MT4> )
1507 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1508 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1509 :( IsUpper_v<MT4> ? i : 0UL ) );
1510 const size_t kend( ( IsUpper_v<MT5> )
1511 ?( ( IsLower_v<MT4> )
1512 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
1513 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
1514 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1515
1516 size_t k( kbegin );
1517
1518 if( k < kend )
1519 {
1520 SIMDType a1( A.load(i ,k) );
1521 SIMDType a2( A.load(i+SIMDSIZE,k) );
1522 SIMDType b1( set( B(k,j ) ) );
1523 SIMDType b2( set( B(k,j+1UL) ) );
1524 SIMDType b3( set( B(k,j+2UL) ) );
1525 SIMDType xmm1( a1 * b1 );
1526 SIMDType xmm2( a2 * b1 );
1527 SIMDType xmm3( a1 * b2 );
1528 SIMDType xmm4( a2 * b2 );
1529 SIMDType xmm5( a1 * b3 );
1530 SIMDType xmm6( a2 * b3 );
1531
1532 for( ++k; k<kend; ++k ) {
1533 a1 = A.load(i ,k);
1534 a2 = A.load(i+SIMDSIZE,k);
1535 b1 = set( B(k,j ) );
1536 b2 = set( B(k,j+1UL) );
1537 b3 = set( B(k,j+2UL) );
1538 xmm1 += a1 * b1;
1539 xmm2 += a2 * b1;
1540 xmm3 += a1 * b2;
1541 xmm4 += a2 * b2;
1542 xmm5 += a1 * b3;
1543 xmm6 += a2 * b3;
1544 }
1545
1546 C.store( i , j , xmm1 );
1547 C.store( i+SIMDSIZE, j , xmm2 );
1548 C.store( i , j+1UL, xmm3 );
1549 C.store( i+SIMDSIZE, j+1UL, xmm4 );
1550 C.store( i , j+2UL, xmm5 );
1551 C.store( i+SIMDSIZE, j+2UL, xmm6 );
1552 }
1553 else
1554 {
1555 const SIMDType zero;
1556 C.store( i , j , zero );
1557 C.store( i+SIMDSIZE, j , zero );
1558 C.store( i , j+1UL, zero );
1559 C.store( i+SIMDSIZE, j+1UL, zero );
1560 C.store( i , j+2UL, zero );
1561 C.store( i+SIMDSIZE, j+2UL, zero );
1562 }
1563 }
1564
1565 for( ; (j+2UL) <= jend; j+=2UL )
1566 {
1567 const size_t kbegin( ( IsLower_v<MT5> )
1568 ?( ( IsUpper_v<MT4> )
1569 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1570 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1571 :( IsUpper_v<MT4> ? i : 0UL ) );
1572 const size_t kend( ( IsUpper_v<MT5> )
1573 ?( ( IsLower_v<MT4> )
1574 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1575 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1576 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1577
1578 size_t k( kbegin );
1579
1580 if( k < kend )
1581 {
1582 SIMDType a1( A.load(i ,k) );
1583 SIMDType a2( A.load(i+SIMDSIZE,k) );
1584 SIMDType b1( set( B(k,j ) ) );
1585 SIMDType b2( set( B(k,j+1UL) ) );
1586 SIMDType xmm1( a1 * b1 );
1587 SIMDType xmm2( a2 * b1 );
1588 SIMDType xmm3( a1 * b2 );
1589 SIMDType xmm4( a2 * b2 );
1590
1591 for( ++k; k<kend; ++k ) {
1592 a1 = A.load(i ,k);
1593 a2 = A.load(i+SIMDSIZE,k);
1594 b1 = set( B(k,j ) );
1595 b2 = set( B(k,j+1UL) );
1596 xmm1 += a1 * b1;
1597 xmm2 += a2 * b1;
1598 xmm3 += a1 * b2;
1599 xmm4 += a2 * b2;
1600 }
1601
1602 C.store( i , j , xmm1 );
1603 C.store( i+SIMDSIZE, j , xmm2 );
1604 C.store( i , j+1UL, xmm3 );
1605 C.store( i+SIMDSIZE, j+1UL, xmm4 );
1606 }
1607 else
1608 {
1609 const SIMDType zero;
1610 C.store( i , j , zero );
1611 C.store( i+SIMDSIZE, j , zero );
1612 C.store( i , j+1UL, zero );
1613 C.store( i+SIMDSIZE, j+1UL, zero );
1614 }
1615 }
1616
1617 if( j < jend )
1618 {
1619 const size_t kbegin( ( IsLower_v<MT5> )
1620 ?( ( IsUpper_v<MT4> )
1621 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1622 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1623 :( IsUpper_v<MT4> ? i : 0UL ) );
1624 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
1625
1626 size_t k( kbegin );
1627
1628 if( k < kend )
1629 {
1630 SIMDType b1( set( B(k,j) ) );
1631 SIMDType xmm1( A.load(i ,k) * b1 );
1632 SIMDType xmm2( A.load(i+SIMDSIZE,k) * b1 );
1633
1634 for( ++k; k<kend; ++k ) {
1635 b1 = set( B(k,j) );
1636 xmm1 += A.load(i ,k) * b1;
1637 xmm2 += A.load(i+SIMDSIZE,k) * b1;
1638 }
1639
1640 C.store( i , j, xmm1 );
1641 C.store( i+SIMDSIZE, j, xmm2 );
1642 }
1643 else
1644 {
1645 const SIMDType zero;
1646 C.store( i , j, zero );
1647 C.store( i+SIMDSIZE, j, zero );
1648 }
1649
1650 if( LOW ) ++j;
1651 }
1652
1653 if( LOW ) {
1654 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
1655 for( ; j<N; ++j ) {
1656 for( size_t ii=i; ii<iiend; ++ii ) {
1657 reset( C(ii,j) );
1658 }
1659 }
1660 }
1661 }
1662
1663 for( ; i<ipos; i+=SIMDSIZE )
1664 {
1665 const size_t jend( LOW ? min(i+SIMDSIZE,N) : N );
1666 size_t j( 0UL );
1667
1668 if( SYM || HERM ) {
1669 const size_t iiend( min(i+SIMDSIZE,M) );
1670 for( ; j<i; ++j ) {
1671 for( size_t ii=i; ii<iiend; ++ii ) {
1672 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1673 }
1674 }
1675 }
1676 else if( UPP ) {
1677 const size_t iiend( min(i+SIMDSIZE,M) );
1678 for( ; j<i; ++j ) {
1679 for( size_t ii=i; ii<iiend; ++ii ) {
1680 reset( C(ii,j) );
1681 }
1682 }
1683 }
1684
1685 for( ; (j+4UL) <= jend; j+=4UL )
1686 {
1687 const size_t kbegin( ( IsLower_v<MT5> )
1688 ?( ( IsUpper_v<MT4> )
1689 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1690 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1691 :( IsUpper_v<MT4> ? i : 0UL ) );
1692 const size_t kend( ( IsUpper_v<MT5> )
1693 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
1694 :( K ) );
1695
1696 size_t k( kbegin );
1697
1698 if( k < kend )
1699 {
1700 SIMDType a1( A.load(i,k) );
1701 SIMDType xmm1( a1 * set( B(k,j ) ) );
1702 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
1703 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
1704 SIMDType xmm4( a1 * set( B(k,j+3UL) ) );
1705
1706 for( ++k; k<kend; ++k ) {
1707 a1 = A.load(i,k);
1708 xmm1 += a1 * set( B(k,j ) );
1709 xmm2 += a1 * set( B(k,j+1UL) );
1710 xmm3 += a1 * set( B(k,j+2UL) );
1711 xmm4 += a1 * set( B(k,j+3UL) );
1712 }
1713
1714 C.store( i, j , xmm1 );
1715 C.store( i, j+1UL, xmm2 );
1716 C.store( i, j+2UL, xmm3 );
1717 C.store( i, j+3UL, xmm4 );
1718 }
1719 else
1720 {
1721 const SIMDType zero;
1722 C.store( i, j , zero );
1723 C.store( i, j+1UL, zero );
1724 C.store( i, j+2UL, zero );
1725 C.store( i, j+3UL, zero );
1726 }
1727 }
1728
1729 for( ; (j+3UL) <= jend; j+=3UL )
1730 {
1731 const size_t kbegin( ( IsLower_v<MT5> )
1732 ?( ( IsUpper_v<MT4> )
1733 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1734 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1735 :( IsUpper_v<MT4> ? i : 0UL ) );
1736 const size_t kend( ( IsUpper_v<MT5> )
1737 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
1738 :( K ) );
1739
1740 size_t k( kbegin );
1741
1742 if( k < kend )
1743 {
1744 SIMDType a1( A.load(i,k) );
1745 SIMDType xmm1( a1 * set( B(k,j ) ) );
1746 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
1747 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
1748
1749 for( ++k; k<kend; ++k ) {
1750 a1 = A.load(i,k);
1751 xmm1 += a1 * set( B(k,j ) );
1752 xmm2 += a1 * set( B(k,j+1UL) );
1753 xmm3 += a1 * set( B(k,j+2UL) );
1754 }
1755
1756 C.store( i, j , xmm1 );
1757 C.store( i, j+1UL, xmm2 );
1758 C.store( i, j+2UL, xmm3 );
1759 }
1760 else
1761 {
1762 const SIMDType zero;
1763 C.store( i, j , zero );
1764 C.store( i, j+1UL, zero );
1765 C.store( i, j+2UL, zero );
1766 }
1767 }
1768
1769 for( ; (j+2UL) <= jend; j+=2UL )
1770 {
1771 const size_t kbegin( ( IsLower_v<MT5> )
1772 ?( ( IsUpper_v<MT4> )
1773 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1774 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1775 :( IsUpper_v<MT4> ? i : 0UL ) );
1776 const size_t kend( ( IsUpper_v<MT5> )
1777 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1778 :( K ) );
1779
1780 size_t k( kbegin );
1781
1782 if( k < kend )
1783 {
1784 SIMDType a1( A.load(i,k) );
1785 SIMDType xmm1( a1 * set( B(k,j ) ) );
1786 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
1787
1788 for( ++k; k<kend; ++k ) {
1789 a1 = A.load(i,k);
1790 xmm1 += a1 * set( B(k,j ) );
1791 xmm2 += a1 * set( B(k,j+1UL) );
1792 }
1793
1794 C.store( i, j , xmm1 );
1795 C.store( i, j+1UL, xmm2 );
1796 }
1797 else
1798 {
1799 const SIMDType zero;
1800 C.store( i, j , zero );
1801 C.store( i, j+1UL, zero );
1802 }
1803 }
1804
1805 if( j < jend )
1806 {
1807 const size_t kbegin( ( IsLower_v<MT5> )
1808 ?( ( IsUpper_v<MT4> )
1809 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1810 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1811 :( IsUpper_v<MT4> ? i : 0UL ) );
1812
1813 size_t k( kbegin );
1814
1815 if( k < K )
1816 {
1817 SIMDType xmm1( A.load(i,k) * set( B(k,j) ) );
1818
1819 for( ++k; k<K; ++k ) {
1820 xmm1 += A.load(i,k) * set( B(k,j) );
1821 }
1822
1823 C.store( i, j, xmm1 );
1824 }
1825 else
1826 {
1827 const SIMDType zero;
1828 C.store( i, j, zero );
1829 }
1830
1831 if( LOW ) ++j;
1832 }
1833
1834 if( LOW ) {
1835 const size_t iiend( min(i+SIMDSIZE,M) );
1836 for( ; j<N; ++j ) {
1837 for( size_t ii=i; ii<iiend; ++ii ) {
1838 reset( C(ii,j) );
1839 }
1840 }
1841 }
1842 }
1843
1844 for( ; remainder && i<M; ++i )
1845 {
1846 size_t j( 0UL );
1847
1848 if( SYM || HERM ) {
1849 for( ; j<i; ++j ) {
1850 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1851 }
1852 }
1853 else if( UPP ) {
1854 for( ; j<i; ++j ) {
1855 reset( C(i,j) );
1856 }
1857 }
1858
1859 for( ; (j+2UL) <= N; j+=2UL )
1860 {
1861 const size_t kbegin( ( IsLower_v<MT5> )
1862 ?( ( IsUpper_v<MT4> )
1863 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1864 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1865 :( IsUpper_v<MT4> ? i : 0UL ) );
1866 const size_t kend( ( IsUpper_v<MT5> )
1867 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1868 :( K ) );
1869
1870 size_t k( kbegin );
1871
1872 if( k < kend )
1873 {
1874 ElementType value1( A(i,k) * B(k,j ) );
1875 ElementType value2( A(i,k) * B(k,j+1UL) );
1876
1877 for( ++k; k<kend; ++k ) {
1878 value1 += A(i,k) * B(k,j );
1879 value2 += A(i,k) * B(k,j+1UL);
1880 }
1881
1882 C(i,j ) = value1;
1883 C(i,j+1UL) = value2;
1884 }
1885 else
1886 {
1887 reset( C(i,j ) );
1888 reset( C(i,j+1UL) );
1889 }
1890 }
1891
1892 if( j < N )
1893 {
1894 const size_t kbegin( ( IsLower_v<MT5> )
1895 ?( ( IsUpper_v<MT4> )
1896 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1897 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1898 :( IsUpper_v<MT4> ? i : 0UL ) );
1899
1900 size_t k( kbegin );
1901
1902 if( k < K )
1903 {
1904 ElementType value( A(i,k) * B(k,j) );
1905
1906 for( ++k; k<K; ++k ) {
1907 value += A(i,k) * B(k,j);
1908 }
1909
1910 C(i,j) = value;
1911 }
1912 else
1913 {
1914 reset( C(i,j) );
1915 }
1916 }
1917 }
1918 }
1920 //**********************************************************************************************
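   // The kernel above is register-blocked: it walks the rows of C in panels of 8, 5, 4, 3, 2 and
   // finally 1 SIMD vector(s), combining each panel with up to four columns of B so that all
   // partial sums remain in SIMD registers; the trailing scalar loop handles the remainder rows
   // that arise when the operands are not padded.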
1921
1922 //**Default assignment to dense matrices (large matrices)***************************************
1936 template< typename MT3 // Type of the left-hand side target matrix
1937 , typename MT4 // Type of the left-hand side matrix operand
1938 , typename MT5 > // Type of the right-hand side matrix operand
1939 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1940 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1941 {
1942 selectDefaultAssignKernel( C, A, B );
1943 }
1945 //**********************************************************************************************
1946
1947 //**Vectorized default assignment to dense matrices (large matrices)****************************
1962 template< typename MT3 // Type of the left-hand side target matrix
1963 , typename MT4 // Type of the left-hand side matrix operand
1964 , typename MT5 > // Type of the right-hand side matrix operand
1965 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1966 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1967 {
1968 if( SYM )
1969 smmm( C, A, B, ElementType(1) );
1970 else if( HERM )
1971 hmmm( C, A, B, ElementType(1) );
1972 else if( LOW )
1973 lmmm( C, A, B, ElementType(1), ElementType(0) );
1974 else if( UPP )
1975 ummm( C, A, B, ElementType(1), ElementType(0) );
1976 else
1977 mmm( C, A, B, ElementType(1), ElementType(0) );
1978 }
1980 //**********************************************************************************************
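   // smmm/hmmm/lmmm/ummm/mmm are Blaze's cache-blocked fallback kernels for symmetric, Hermitian,
   // lower, upper and general results, respectively; the scalar arguments are the alpha (and,
   // where present, beta) factors of C = alpha*A*B + beta*C, so this call computes C = A*B from
   // scratch.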
1981
1982 //**BLAS-based assignment to dense matrices (default)*******************************************
1996 template< typename MT3 // Type of the left-hand side target matrix
1997 , typename MT4 // Type of the left-hand side matrix operand
1998 , typename MT5 > // Type of the right-hand side matrix operand
1999 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2000 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2001 {
2002 selectLargeAssignKernel( C, A, B );
2003 }
2005 //**********************************************************************************************
2006
2007 //**BLAS-based assignment to dense matrices*****************************************************
2008#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2022 template< typename MT3 // Type of the left-hand side target matrix
2023 , typename MT4 // Type of the left-hand side matrix operand
2024 , typename MT5 > // Type of the right-hand side matrix operand
2025 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2026 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2027 {
2028 using ET = ElementType_t<MT3>;
2029
2030 if( IsTriangular_v<MT4> ) {
2031 assign( C, B );
2032 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2033 }
2034 else if( IsTriangular_v<MT5> ) {
2035 assign( C, A );
2036 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2037 }
2038 else {
2039 gemm( C, A, B, ET(1), ET(0) );
2040 }
2041 }
2043#endif
2044 //**********************************************************************************************
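   // Mapping sketch: a triangular operand is handled by copying the other operand into C and
   // applying an in-place triangular multiply (trmm); all remaining cases reduce to a single
   // gemm() call with alpha=1 and beta=0.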
2045
2046 //**Assignment to sparse matrices***************************************************************
2059 template< typename MT // Type of the target sparse matrix
2060 , bool SO > // Storage order of the target sparse matrix
2061 friend inline auto assign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
2062 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2063 {
2065
2066 using TmpType = If_t< SO, ResultType, OppositeType >;
2067
2074
2075 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
2076 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
2077
2078 const ForwardFunctor fwd;
2079
2080 const TmpType tmp( serial( rhs ) );
2081 assign( *lhs, fwd( tmp ) );
2082 }
2084 //**********************************************************************************************
2085
2086 //**Restructuring assignment to row-major matrices**********************************************
2101 template< typename MT > // Type of the target matrix
2102 friend inline auto assign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
2103 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2104 {
2106
2108
2109 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
2110 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
2111
2112 const ForwardFunctor fwd;
2113
2114 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
2115 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
2116
2117 assign( *lhs, fwd( A * B ) );
2118 }
2120 //**********************************************************************************************
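   // Restructuring example: if MT1 is symmetric, A == trans(A), so the column-major operand can
   // be viewed as its row-major transpose; the product is then re-dispatched as a mixed
   // storage-order multiplication that suits the row-major target without an extra temporary.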
2121
2122 //**Addition assignment to dense matrices*******************************************************
2135 template< typename MT // Type of the target dense matrix
2136 , bool SO > // Storage order of the target dense matrix
2137 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
2138 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2139 {
2141
2142 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
2143 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
2144
2145 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2146 return;
2147 }
2148
2149 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2150 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2151
2152 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2153 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2154 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2155 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2156 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
2157 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
2158
2159 TDMatTDMatMultExpr::selectAddAssignKernel( *lhs, A, B );
2160 }
2162 //**********************************************************************************************
2163
2164 //**Addition assignment to dense matrices (kernel selection)************************************
2175 template< typename MT3 // Type of the left-hand side target matrix
2176 , typename MT4 // Type of the left-hand side matrix operand
2177 , typename MT5 > // Type of the right-hand side matrix operand
2178 static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2179 {
2180 if( ( IsDiagonal_v<MT4> ) ||
2181 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
2182 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
2183 selectSmallAddAssignKernel( C, A, B );
2184 else
2185 selectBlasAddAssignKernel( C, A, B );
2186 }
2188 //**********************************************************************************************
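   //**Note on the addition assignment kernel selection********************************************
   // The heuristic above prefers the small kernel whenever the left-hand side operand is
   // diagonal, its number of rows does not exceed SIMDSIZE*10 (checked in release mode only), or
   // the target matrix holds fewer than TDMATTDMATMULT_THRESHOLD elements; otherwise the BLAS
   // kernel (or, without BLAS support, the large-matrix kernel) is used. As a rough example,
   // assuming double-precision elements and AVX registers (i.e. SIMDSIZE == 4), products with at
   // most 40 rows always take the small kernel path in release builds.
   //**********************************************************************************************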
2189
2190 //**Default addition assignment to dense matrices (general/general)*****************************
2204 template< typename MT3 // Type of the left-hand side target matrix
2205 , typename MT4 // Type of the left-hand side matrix operand
2206 , typename MT5 > // Type of the right-hand side matrix operand
2207 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2208 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2209 {
2210 const size_t M( A.rows() );
2211 const size_t N( B.columns() );
2212 const size_t K( A.columns() );
2213
2214 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2215
2216 for( size_t j=0UL; j<N; ++j )
2217 {
2218 const size_t kbegin( ( IsLower_v<MT5> )
2219 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2220 :( 0UL ) );
2221 const size_t kend( ( IsUpper_v<MT5> )
2222 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2223 :( K ) );
2224 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2225
2226 for( size_t k=kbegin; k<kend; ++k )
2227 {
2228 const size_t ibegin( ( IsLower_v<MT4> )
2229 ?( ( IsStrictlyLower_v<MT4> )
2230 ?( LOW ? max(j,k+1UL) : k+1UL )
2231 :( LOW ? max(j,k) : k ) )
2232 :( LOW ? j : 0UL ) );
2233 const size_t iend( ( IsUpper_v<MT4> )
2234 ?( ( IsStrictlyUpper_v<MT4> )
2235 ?( UPP ? min(j+1UL,k) : k )
2236 :( UPP ? min(j,k)+1UL : k+1UL ) )
2237 :( UPP ? j+1UL : M ) );
2238
2239 if( ( LOW || UPP ) && ibegin >= iend ) continue;
2240 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2241
2242 const size_t inum( iend - ibegin );
2243 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
2244 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
2245
2246 for( size_t i=ibegin; i<ipos; i+=2UL ) {
2247 C(i ,j) += A(i ,k) * B(k,j);
2248 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2249 }
2250 if( ipos < iend ) {
2251 C(ipos,j) += A(ipos,k) * B(k,j);
2252 }
2253 }
2254 }
2255 }
2257 //**********************************************************************************************
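   //**Note on the default addition assignment kernel (general/general)****************************
   // The scalar kernel above restricts all loops to the structurally non-zero parts of the
   // operands: for an upper triangular B only the rows k <= j of B contribute to column j
   // (k < j in the strictly upper case), and for a lower triangular A only the rows i >= k of
   // column k are touched. The innermost loop is additionally unrolled by a factor of two.
   //**********************************************************************************************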
2258
2259 //**Default addition assignment to dense matrices (general/diagonal)****************************
2273 template< typename MT3 // Type of the left-hand side target matrix
2274 , typename MT4 // Type of the left-hand side matrix operand
2275 , typename MT5 > // Type of the right-hand side matrix operand
2276 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2277 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2278 {
2280
2281 const size_t M( A.rows() );
2282 const size_t N( B.columns() );
2283
2284 for( size_t j=0UL; j<N; ++j )
2285 {
2286 const size_t ibegin( ( IsLower_v<MT4> )
2287 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
2288 :( 0UL ) );
2289 const size_t iend( ( IsUpper_v<MT4> )
2290 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
2291 :( M ) );
2292 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2293
2294 const size_t inum( iend - ibegin );
2295 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
2296 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
2297
2298 for( size_t i=ibegin; i<ipos; i+=2UL ) {
2299 C(i ,j) += A(i ,j) * B(j,j);
2300 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
2301 }
2302 if( ipos < iend ) {
2303 C(ipos,j) += A(ipos,j) * B(j,j);
2304 }
2305 }
2306 }
2308 //**********************************************************************************************
2309
2310 //**Default addition assignment to dense matrices (diagonal/general)****************************
2324 template< typename MT3 // Type of the left-hand side target matrix
2325 , typename MT4 // Type of the left-hand side matrix operand
2326 , typename MT5 > // Type of the right-hand side matrix operand
2327 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2328 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2329 {
2331
2332 const size_t M( A.rows() );
2333 const size_t N( B.columns() );
2334
2335 for( size_t j=0UL; j<N; ++j )
2336 {
2337 const size_t ibegin( ( IsLower_v<MT5> )
2338 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2339 :( 0UL ) );
2340 const size_t iend( ( IsUpper_v<MT5> )
2341 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2342 :( M ) );
2343 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2344
2345 const size_t inum( iend - ibegin );
2346 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
2347 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
2348
2349 for( size_t i=ibegin; i<ipos; i+=2UL ) {
2350 C(i ,j) += A(i ,i ) * B(i ,j);
2351 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2352 }
2353 if( ipos < iend ) {
2354 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
2355 }
2356 }
2357 }
2359 //**********************************************************************************************
2360
2361 //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2375 template< typename MT3 // Type of the left-hand side target matrix
2376 , typename MT4 // Type of the left-hand side matrix operand
2377 , typename MT5 > // Type of the right-hand side matrix operand
2378 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2379 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2380 {
2382
2383 for( size_t i=0UL; i<A.rows(); ++i ) {
2384 C(i,i) += A(i,i) * B(i,i);
2385 }
2386 }
2388 //**********************************************************************************************
2389
2390 //**Default addition assignment to dense matrices (small matrices)******************************
2404 template< typename MT3 // Type of the left-hand side target matrix
2405 , typename MT4 // Type of the left-hand side matrix operand
2406 , typename MT5 > // Type of the right-hand side matrix operand
2407 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2408 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2409 {
2410 selectDefaultAddAssignKernel( C, A, B );
2411 }
2413 //**********************************************************************************************
2414
2415 //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
2430 template< typename MT3 // Type of the left-hand side target matrix
2431 , typename MT4 // Type of the left-hand side matrix operand
2432 , typename MT5 > // Type of the right-hand side matrix operand
2433 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2434 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2435 {
2440
2441 const ForwardFunctor fwd;
2442
2443 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2444 const OppositeType_t<MT5> tmp( serial( B ) );
2445 addAssign( C, fwd( A * tmp ) );
2446 }
2447 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2448 const OppositeType_t<MT4> tmp( serial( A ) );
2449 addAssign( C, fwd( tmp * B ) );
2450 }
2451 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2452 const OppositeType_t<MT5> tmp( serial( B ) );
2453 addAssign( C, fwd( A * tmp ) );
2454 }
2455 else {
2456 const OppositeType_t<MT4> tmp( serial( A ) );
2457 addAssign( C, fwd( tmp * B ) );
2458 }
2459 }
2461 //**********************************************************************************************
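   //**Note on the vectorized small kernel for row-major targets***********************************
   // Two column-major operands are not well suited for a direct vectorization into a row-major
   // target. The kernel above therefore converts one operand (preferring a non-resizable one,
   // otherwise the smaller one) to its opposite, row-major storage order and delegates the work
   // to the corresponding mixed storage order multiplication kernel via A * tmp or tmp * B.
   //**********************************************************************************************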
2462
2463 //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2478 template< typename MT3 // Type of the left-hand side target matrix
2479 , typename MT4 // Type of the left-hand side matrix operand
2480 , typename MT5 > // Type of the right-hand side matrix operand
2481 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2482 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2483 {
2484 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
2485
2486 const size_t M( A.rows() );
2487 const size_t N( B.columns() );
2488 const size_t K( A.columns() );
2489
2490 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2491
2492 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
2493 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
2494
2495 size_t i( 0UL );
2496
2497 if( IsIntegral_v<ElementType> )
2498 {
2499 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2500 for( size_t j=0UL; j<N; ++j )
2501 {
2502 const size_t kbegin( ( IsLower_v<MT5> )
2503 ?( ( IsUpper_v<MT4> )
2504 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2505 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2506 :( IsUpper_v<MT4> ? i : 0UL ) );
2507 const size_t kend( ( IsUpper_v<MT5> )
2508 ?( ( IsLower_v<MT4> )
2509 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2510 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
2511 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
2512
2513 SIMDType xmm1( C.load(i ,j) );
2514 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2515 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2516 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2517 SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
2518 SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
2519 SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
2520 SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
2521
2522 for( size_t k=kbegin; k<kend; ++k ) {
2523 const SIMDType b1( set( B(k,j) ) );
2524 xmm1 += A.load(i ,k) * b1;
2525 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2526 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2527 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2528 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2529 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
2530 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
2531 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
2532 }
2533
2534 C.store( i , j, xmm1 );
2535 C.store( i+SIMDSIZE , j, xmm2 );
2536 C.store( i+SIMDSIZE*2UL, j, xmm3 );
2537 C.store( i+SIMDSIZE*3UL, j, xmm4 );
2538 C.store( i+SIMDSIZE*4UL, j, xmm5 );
2539 C.store( i+SIMDSIZE*5UL, j, xmm6 );
2540 C.store( i+SIMDSIZE*6UL, j, xmm7 );
2541 C.store( i+SIMDSIZE*7UL, j, xmm8 );
2542 }
2543 }
2544 }
2545
2546 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2547 {
2548 size_t j( 0UL );
2549
2550 for( ; (j+2UL) <= N; j+=2UL )
2551 {
2552 const size_t kbegin( ( IsLower_v<MT5> )
2553 ?( ( IsUpper_v<MT4> )
2554 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2555 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2556 :( IsUpper_v<MT4> ? i : 0UL ) );
2557 const size_t kend( ( IsUpper_v<MT5> )
2558 ?( ( IsLower_v<MT4> )
2559 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2560 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2561 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
2562
2563 SIMDType xmm1 ( C.load(i ,j ) );
2564 SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
2565 SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
2566 SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
2567 SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
2568 SIMDType xmm6 ( C.load(i ,j+1UL) );
2569 SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
2570 SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
2571 SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
2572 SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
2573
2574 for( size_t k=kbegin; k<kend; ++k ) {
2575 const SIMDType a1( A.load(i ,k) );
2576 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2577 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2578 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2579 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2580 const SIMDType b1( set( B(k,j ) ) );
2581 const SIMDType b2( set( B(k,j+1UL) ) );
2582 xmm1 += a1 * b1;
2583 xmm2 += a2 * b1;
2584 xmm3 += a3 * b1;
2585 xmm4 += a4 * b1;
2586 xmm5 += a5 * b1;
2587 xmm6 += a1 * b2;
2588 xmm7 += a2 * b2;
2589 xmm8 += a3 * b2;
2590 xmm9 += a4 * b2;
2591 xmm10 += a5 * b2;
2592 }
2593
2594 C.store( i , j , xmm1 );
2595 C.store( i+SIMDSIZE , j , xmm2 );
2596 C.store( i+SIMDSIZE*2UL, j , xmm3 );
2597 C.store( i+SIMDSIZE*3UL, j , xmm4 );
2598 C.store( i+SIMDSIZE*4UL, j , xmm5 );
2599 C.store( i , j+1UL, xmm6 );
2600 C.store( i+SIMDSIZE , j+1UL, xmm7 );
2601 C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2602 C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2603 C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
2604 }
2605
2606 if( j < N )
2607 {
2608 const size_t kbegin( ( IsLower_v<MT5> )
2609 ?( ( IsUpper_v<MT4> )
2610 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2611 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2612 :( IsUpper_v<MT4> ? i : 0UL ) );
2613 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
2614
2615 SIMDType xmm1( C.load(i ,j) );
2616 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2617 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2618 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2619 SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
2620
2621 for( size_t k=kbegin; k<kend; ++k ) {
2622 const SIMDType b1( set( B(k,j) ) );
2623 xmm1 += A.load(i ,k) * b1;
2624 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2625 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2626 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2627 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2628 }
2629
2630 C.store( i , j, xmm1 );
2631 C.store( i+SIMDSIZE , j, xmm2 );
2632 C.store( i+SIMDSIZE*2UL, j, xmm3 );
2633 C.store( i+SIMDSIZE*3UL, j, xmm4 );
2634 C.store( i+SIMDSIZE*4UL, j, xmm5 );
2635 }
2636 }
2637
2638 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2639 {
2640 size_t j( 0UL );
2641
2642 for( ; (j+2UL) <= N; j+=2UL )
2643 {
2644 const size_t kbegin( ( IsLower_v<MT5> )
2645 ?( ( IsUpper_v<MT4> )
2646 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2647 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2648 :( IsUpper_v<MT4> ? i : 0UL ) );
2649 const size_t kend( ( IsUpper_v<MT5> )
2650 ?( ( IsLower_v<MT4> )
2651 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2652 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2653 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
2654
2655 SIMDType xmm1( C.load(i ,j ) );
2656 SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
2657 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
2658 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
2659 SIMDType xmm5( C.load(i ,j+1UL) );
2660 SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
2661 SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
2662 SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
2663
2664 for( size_t k=kbegin; k<kend; ++k ) {
2665 const SIMDType a1( A.load(i ,k) );
2666 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2667 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2668 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2669 const SIMDType b1( set( B(k,j ) ) );
2670 const SIMDType b2( set( B(k,j+1UL) ) );
2671 xmm1 += a1 * b1;
2672 xmm2 += a2 * b1;
2673 xmm3 += a3 * b1;
2674 xmm4 += a4 * b1;
2675 xmm5 += a1 * b2;
2676 xmm6 += a2 * b2;
2677 xmm7 += a3 * b2;
2678 xmm8 += a4 * b2;
2679 }
2680
2681 C.store( i , j , xmm1 );
2682 C.store( i+SIMDSIZE , j , xmm2 );
2683 C.store( i+SIMDSIZE*2UL, j , xmm3 );
2684 C.store( i+SIMDSIZE*3UL, j , xmm4 );
2685 C.store( i , j+1UL, xmm5 );
2686 C.store( i+SIMDSIZE , j+1UL, xmm6 );
2687 C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2688 C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2689 }
2690
2691 if( j < N )
2692 {
2693 const size_t kbegin( ( IsLower_v<MT5> )
2694 ?( ( IsUpper_v<MT4> )
2695 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2696 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2697 :( IsUpper_v<MT4> ? i : 0UL ) );
2698 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
2699
2700 SIMDType xmm1( C.load(i ,j) );
2701 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2702 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2703 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2704
2705 for( size_t k=kbegin; k<kend; ++k ) {
2706 const SIMDType b1( set( B(k,j) ) );
2707 xmm1 += A.load(i ,k) * b1;
2708 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2709 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2710 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2711 }
2712
2713 C.store( i , j, xmm1 );
2714 C.store( i+SIMDSIZE , j, xmm2 );
2715 C.store( i+SIMDSIZE*2UL, j, xmm3 );
2716 C.store( i+SIMDSIZE*3UL, j, xmm4 );
2717 }
2718 }
2719
2720 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2721 {
2722 size_t j( 0UL );
2723
2724 for( ; (j+2UL) <= N; j+=2UL )
2725 {
2726 const size_t kbegin( ( IsLower_v<MT5> )
2727 ?( ( IsUpper_v<MT4> )
2728 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2729 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2730 :( IsUpper_v<MT4> ? i : 0UL ) );
2731 const size_t kend( ( IsUpper_v<MT5> )
2732 ?( ( IsLower_v<MT4> )
2733 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2734 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2735 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
2736
2737 SIMDType xmm1( C.load(i ,j ) );
2738 SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
2739 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
2740 SIMDType xmm4( C.load(i ,j+1UL) );
2741 SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
2742 SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
2743
2744 for( size_t k=kbegin; k<kend; ++k ) {
2745 const SIMDType a1( A.load(i ,k) );
2746 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2747 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2748 const SIMDType b1( set( B(k,j ) ) );
2749 const SIMDType b2( set( B(k,j+1UL) ) );
2750 xmm1 += a1 * b1;
2751 xmm2 += a2 * b1;
2752 xmm3 += a3 * b1;
2753 xmm4 += a1 * b2;
2754 xmm5 += a2 * b2;
2755 xmm6 += a3 * b2;
2756 }
2757
2758 C.store( i , j , xmm1 );
2759 C.store( i+SIMDSIZE , j , xmm2 );
2760 C.store( i+SIMDSIZE*2UL, j , xmm3 );
2761 C.store( i , j+1UL, xmm4 );
2762 C.store( i+SIMDSIZE , j+1UL, xmm5 );
2763 C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2764 }
2765
2766 if( j < N )
2767 {
2768 const size_t kbegin( ( IsLower_v<MT5> )
2769 ?( ( IsUpper_v<MT4> )
2770 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2771 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2772 :( IsUpper_v<MT4> ? i : 0UL ) );
2773 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2774
2775 SIMDType xmm1( C.load(i ,j) );
2776 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2777 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2778
2779 for( size_t k=kbegin; k<kend; ++k ) {
2780 const SIMDType b1( set( B(k,j) ) );
2781 xmm1 += A.load(i ,k) * b1;
2782 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2783 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2784 }
2785
2786 C.store( i , j, xmm1 );
2787 C.store( i+SIMDSIZE , j, xmm2 );
2788 C.store( i+SIMDSIZE*2UL, j, xmm3 );
2789 }
2790 }
2791
2792 for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2793 {
2794 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
2795 size_t j( UPP ? i : 0UL );
2796
2797 for( ; (j+4UL) <= jend; j+=4UL )
2798 {
2799 const size_t kbegin( ( IsLower_v<MT5> )
2800 ?( ( IsUpper_v<MT4> )
2801 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2802 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2803 :( IsUpper_v<MT4> ? i : 0UL ) );
2804 const size_t kend( ( IsUpper_v<MT5> )
2805 ?( ( IsLower_v<MT4> )
2806 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2807 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2808 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2809
2810 SIMDType xmm1( C.load(i ,j ) );
2811 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2812 SIMDType xmm3( C.load(i ,j+1UL) );
2813 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2814 SIMDType xmm5( C.load(i ,j+2UL) );
2815 SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
2816 SIMDType xmm7( C.load(i ,j+3UL) );
2817 SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
2818
2819 for( size_t k=kbegin; k<kend; ++k ) {
2820 const SIMDType a1( A.load(i ,k) );
2821 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2822 const SIMDType b1( set( B(k,j ) ) );
2823 const SIMDType b2( set( B(k,j+1UL) ) );
2824 const SIMDType b3( set( B(k,j+2UL) ) );
2825 const SIMDType b4( set( B(k,j+3UL) ) );
2826 xmm1 += a1 * b1;
2827 xmm2 += a2 * b1;
2828 xmm3 += a1 * b2;
2829 xmm4 += a2 * b2;
2830 xmm5 += a1 * b3;
2831 xmm6 += a2 * b3;
2832 xmm7 += a1 * b4;
2833 xmm8 += a2 * b4;
2834 }
2835
2836 C.store( i , j , xmm1 );
2837 C.store( i+SIMDSIZE, j , xmm2 );
2838 C.store( i , j+1UL, xmm3 );
2839 C.store( i+SIMDSIZE, j+1UL, xmm4 );
2840 C.store( i , j+2UL, xmm5 );
2841 C.store( i+SIMDSIZE, j+2UL, xmm6 );
2842 C.store( i , j+3UL, xmm7 );
2843 C.store( i+SIMDSIZE, j+3UL, xmm8 );
2844 }
2845
2846 for( ; (j+3UL) <= jend; j+=3UL )
2847 {
2848 const size_t kbegin( ( IsLower_v<MT5> )
2849 ?( ( IsUpper_v<MT4> )
2850 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2851 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2852 :( IsUpper_v<MT4> ? i : 0UL ) );
2853 const size_t kend( ( IsUpper_v<MT5> )
2854 ?( ( IsLower_v<MT4> )
2855 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2856 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2857 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2858
2859 SIMDType xmm1( C.load(i ,j ) );
2860 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2861 SIMDType xmm3( C.load(i ,j+1UL) );
2862 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2863 SIMDType xmm5( C.load(i ,j+2UL) );
2864 SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
2865
2866 for( size_t k=kbegin; k<kend; ++k ) {
2867 const SIMDType a1( A.load(i ,k) );
2868 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2869 const SIMDType b1( set( B(k,j ) ) );
2870 const SIMDType b2( set( B(k,j+1UL) ) );
2871 const SIMDType b3( set( B(k,j+2UL) ) );
2872 xmm1 += a1 * b1;
2873 xmm2 += a2 * b1;
2874 xmm3 += a1 * b2;
2875 xmm4 += a2 * b2;
2876 xmm5 += a1 * b3;
2877 xmm6 += a2 * b3;
2878 }
2879
2880 C.store( i , j , xmm1 );
2881 C.store( i+SIMDSIZE, j , xmm2 );
2882 C.store( i , j+1UL, xmm3 );
2883 C.store( i+SIMDSIZE, j+1UL, xmm4 );
2884 C.store( i , j+2UL, xmm5 );
2885 C.store( i+SIMDSIZE, j+2UL, xmm6 );
2886 }
2887
2888 for( ; (j+2UL) <= jend; j+=2UL )
2889 {
2890 const size_t kbegin( ( IsLower_v<MT5> )
2891 ?( ( IsUpper_v<MT4> )
2892 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2893 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2894 :( IsUpper_v<MT4> ? i : 0UL ) );
2895 const size_t kend( ( IsUpper_v<MT5> )
2896 ?( ( IsLower_v<MT4> )
2897 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2898 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2899 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2900
2901 SIMDType xmm1( C.load(i ,j ) );
2902 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2903 SIMDType xmm3( C.load(i ,j+1UL) );
2904 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2905
2906 for( size_t k=kbegin; k<kend; ++k ) {
2907 const SIMDType a1( A.load(i ,k) );
2908 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2909 const SIMDType b1( set( B(k,j ) ) );
2910 const SIMDType b2( set( B(k,j+1UL) ) );
2911 xmm1 += a1 * b1;
2912 xmm2 += a2 * b1;
2913 xmm3 += a1 * b2;
2914 xmm4 += a2 * b2;
2915 }
2916
2917 C.store( i , j , xmm1 );
2918 C.store( i+SIMDSIZE, j , xmm2 );
2919 C.store( i , j+1UL, xmm3 );
2920 C.store( i+SIMDSIZE, j+1UL, xmm4 );
2921 }
2922
2923 if( j < jend )
2924 {
2925 const size_t kbegin( ( IsLower_v<MT5> )
2926 ?( ( IsUpper_v<MT4> )
2927 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2928 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2929 :( IsUpper_v<MT4> ? i : 0UL ) );
2930 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2931
2932 SIMDType xmm1( C.load(i ,j) );
2933 SIMDType xmm2( C.load(i+SIMDSIZE,j) );
2934
2935 for( size_t k=kbegin; k<kend; ++k ) {
2936 const SIMDType b1( set( B(k,j) ) );
2937 xmm1 += A.load(i ,k) * b1;
2938 xmm2 += A.load(i+SIMDSIZE,k) * b1;
2939 }
2940
2941 C.store( i , j, xmm1 );
2942 C.store( i+SIMDSIZE, j, xmm2 );
2943 }
2944 }
2945
2946 for( ; i<ipos; i+=SIMDSIZE )
2947 {
2948 const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
2949 size_t j( UPP ? i : 0UL );
2950
2951 for( ; (j+4UL) <= jend; j+=4UL )
2952 {
2953 const size_t kbegin( ( IsLower_v<MT5> )
2954 ?( ( IsUpper_v<MT4> )
2955 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2956 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2957 :( IsUpper_v<MT4> ? i : 0UL ) );
2958 const size_t kend( ( IsUpper_v<MT5> )
2959 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2960 :( K ) );
2961
2962 SIMDType xmm1( C.load(i,j ) );
2963 SIMDType xmm2( C.load(i,j+1UL) );
2964 SIMDType xmm3( C.load(i,j+2UL) );
2965 SIMDType xmm4( C.load(i,j+3UL) );
2966
2967 for( size_t k=kbegin; k<kend; ++k ) {
2968 const SIMDType a1( A.load(i,k) );
2969 xmm1 += a1 * set( B(k,j ) );
2970 xmm2 += a1 * set( B(k,j+1UL) );
2971 xmm3 += a1 * set( B(k,j+2UL) );
2972 xmm4 += a1 * set( B(k,j+3UL) );
2973 }
2974
2975 C.store( i, j , xmm1 );
2976 C.store( i, j+1UL, xmm2 );
2977 C.store( i, j+2UL, xmm3 );
2978 C.store( i, j+3UL, xmm4 );
2979 }
2980
2981 for( ; (j+3UL) <= jend; j+=3UL )
2982 {
2983 const size_t kbegin( ( IsLower_v<MT5> )
2984 ?( ( IsUpper_v<MT4> )
2985 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2986 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2987 :( IsUpper_v<MT4> ? i : 0UL ) );
2988 const size_t kend( ( IsUpper_v<MT5> )
2989 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2990 :( K ) );
2991
2992 SIMDType xmm1( C.load(i,j ) );
2993 SIMDType xmm2( C.load(i,j+1UL) );
2994 SIMDType xmm3( C.load(i,j+2UL) );
2995
2996 for( size_t k=kbegin; k<kend; ++k ) {
2997 const SIMDType a1( A.load(i,k) );
2998 xmm1 += a1 * set( B(k,j ) );
2999 xmm2 += a1 * set( B(k,j+1UL) );
3000 xmm3 += a1 * set( B(k,j+2UL) );
3001 }
3002
3003 C.store( i, j , xmm1 );
3004 C.store( i, j+1UL, xmm2 );
3005 C.store( i, j+2UL, xmm3 );
3006 }
3007
3008 for( ; (j+2UL) <= jend; j+=2UL )
3009 {
3010 const size_t kbegin( ( IsLower_v<MT5> )
3011 ?( ( IsUpper_v<MT4> )
3012 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3013 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3014 :( IsUpper_v<MT4> ? i : 0UL ) );
3015 const size_t kend( ( IsUpper_v<MT5> )
3016 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
3017 :( K ) );
3018
3019 SIMDType xmm1( C.load(i,j ) );
3020 SIMDType xmm2( C.load(i,j+1UL) );
3021
3022 for( size_t k=kbegin; k<kend; ++k ) {
3023 const SIMDType a1( A.load(i,k) );
3024 xmm1 += a1 * set( B(k,j ) );
3025 xmm2 += a1 * set( B(k,j+1UL) );
3026 }
3027
3028 C.store( i, j , xmm1 );
3029 C.store( i, j+1UL, xmm2 );
3030 }
3031
3032 if( j < jend )
3033 {
3034 const size_t kbegin( ( IsLower_v<MT5> )
3035 ?( ( IsUpper_v<MT4> )
3036 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3037 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3038 :( IsUpper_v<MT4> ? i : 0UL ) );
3039
3040 SIMDType xmm1( C.load(i,j) );
3041
3042 for( size_t k=kbegin; k<K; ++k ) {
3043 xmm1 += A.load(i,k) * set( B(k,j) );
3044 }
3045
3046 C.store( i, j, xmm1 );
3047 }
3048 }
3049
3050 for( ; remainder && i<M; ++i )
3051 {
3052 const size_t jend( LOW ? i+1UL : N );
3053 size_t j( UPP ? i : 0UL );
3054
3055 for( ; (j+2UL) <= jend; j+=2UL )
3056 {
3057 const size_t kbegin( ( IsLower_v<MT5> )
3058 ?( ( IsUpper_v<MT4> )
3059 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3060 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3061 :( IsUpper_v<MT4> ? i : 0UL ) );
3062 const size_t kend( ( IsUpper_v<MT5> )
3063 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
3064 :( K ) );
3065
3066 ElementType value1( C(i,j ) );
3067 ElementType value2( C(i,j+1UL) );
3068
3069 for( size_t k=kbegin; k<kend; ++k ) {
3070 value1 += A(i,k) * B(k,j );
3071 value2 += A(i,k) * B(k,j+1UL);
3072 }
3073
3074 C(i,j ) = value1;
3075 C(i,j+1UL) = value2;
3076 }
3077
3078 if( j < jend )
3079 {
3080 const size_t kbegin( ( IsLower_v<MT5> )
3081 ?( ( IsUpper_v<MT4> )
3082 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3083 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3084 :( IsUpper_v<MT4> ? i : 0UL ) );
3085
3086 ElementType value( C(i,j) );
3087
3088 for( size_t k=kbegin; k<K; ++k ) {
3089 value += A(i,k) * B(k,j);
3090 }
3091
3092 C(i,j) = value;
3093 }
3094 }
3095 }
3097 //**********************************************************************************************
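   //**Note on the vectorized small kernel for column-major targets********************************
   // The kernel above is register blocked: each iteration of the outer loops loads a panel of C
   // (up to eight SIMD vectors of a single column, or up to five/four/three/two SIMD vectors
   // times two to four columns) into accumulators, streams once through A and B in the k-loop,
   // and writes the accumulators back. The following scalar sketch illustrates the same idea on
   // plain column-major arrays (illustrative only, with hypothetical names and M, N assumed
   // even; it is not the actual Blaze kernel):
   //
   //    #include <cstddef>
   //
   //    // C += A * B for column-major M x K, K x N and M x N arrays.
   //    void blockedAddMultiply( double* C, const double* A, const double* B,
   //                             std::size_t M, std::size_t N, std::size_t K )
   //    {
   //       for( std::size_t j=0UL; j<N; j+=2UL ) {
   //          for( std::size_t i=0UL; i<M; i+=2UL ) {
   //             // 2x2 block of accumulators held in registers
   //             double c00( C[i + j*M] )      , c10( C[(i+1UL) + j*M] );
   //             double c01( C[i + (j+1UL)*M] ), c11( C[(i+1UL) + (j+1UL)*M] );
   //             for( std::size_t k=0UL; k<K; ++k ) {
   //                const double a0( A[i + k*M] ), a1( A[(i+1UL) + k*M] );
   //                const double b0( B[k + j*K] ), b1( B[k + (j+1UL)*K] );
   //                c00 += a0*b0; c10 += a1*b0;
   //                c01 += a0*b1; c11 += a1*b1;
   //             }
   //             C[i + j*M]       = c00; C[(i+1UL) + j*M]       = c10;
   //             C[i + (j+1UL)*M] = c01; C[(i+1UL) + (j+1UL)*M] = c11;
   //          }
   //       }
   //    }
   //**********************************************************************************************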
3098
3099 //**Default addition assignment to dense matrices (large matrices)******************************
3113 template< typename MT3 // Type of the left-hand side target matrix
3114 , typename MT4 // Type of the left-hand side matrix operand
3115 , typename MT5 > // Type of the right-hand side matrix operand
3116 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3117 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3118 {
3119 selectDefaultAddAssignKernel( C, A, B );
3120 }
3122 //**********************************************************************************************
3123
3124 //**Vectorized default addition assignment to dense matrices (large matrices)*******************
3139 template< typename MT3 // Type of the left-hand side target matrix
3140 , typename MT4 // Type of the left-hand side matrix operand
3141 , typename MT5 > // Type of the right-hand side matrix operand
3142 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3143 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3144 {
3145 if( LOW )
3146 lmmm( C, A, B, ElementType(1), ElementType(1) );
3147 else if( UPP )
3148 ummm( C, A, B, ElementType(1), ElementType(1) );
3149 else
3150 mmm( C, A, B, ElementType(1), ElementType(1) );
3151 }
3153 //**********************************************************************************************
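   //**Note on the vectorized large-matrix addition assignment kernel******************************
   // For large operands the kernel above delegates to the blocked multiplication backends:
   // mmm() for a general result and lmmm()/ummm() when only the lower or upper part of the
   // result has to be computed. Both scaling factors are one, so the product is accumulated
   // onto the target, i.e. C <- C + A*B.
   //**********************************************************************************************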
3154
3155 //**BLAS-based addition assignment to dense matrices (default)**********************************
3169 template< typename MT3 // Type of the left-hand side target matrix
3170 , typename MT4 // Type of the left-hand side matrix operand
3171 , typename MT5 > // Type of the right-hand side matrix operand
3172 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3173 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3174 {
3175 selectLargeAddAssignKernel( C, A, B );
3176 }
3178 //**********************************************************************************************
3179
3180 //**BLAS-based addition assignment to dense matrices********************************************
3181#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
3195 template< typename MT3 // Type of the left-hand side target matrix
3196 , typename MT4 // Type of the left-hand side matrix operand
3197 , typename MT5 > // Type of the right-hand side matrix operand
3198 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3199 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3200 {
3201 using ET = ElementType_t<MT3>;
3202
3203 if( IsTriangular_v<MT4> ) {
3204 ResultType_t<MT3> tmp( serial( B ) );
3205 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
3206 addAssign( C, tmp );
3207 }
3208 else if( IsTriangular_v<MT5> ) {
3209 ResultType_t<MT3> tmp( serial( A ) );
3210 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
3211 addAssign( C, tmp );
3212 }
3213 else {
3214 gemm( C, A, B, ET(1), ET(1) );
3215 }
3216 }
3218#endif
3219 //**********************************************************************************************
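   //**Note on the BLAS-based addition assignment kernel*******************************************
   // In contrast to the plain assignment kernel, gemm() can accumulate directly by using
   // beta = 1, i.e. C <- A*B + C. The trmm()-based paths cannot, since trmm() overwrites its
   // dense argument in place; they therefore form the triangular product in a temporary first
   // and add that temporary to C.
   //**********************************************************************************************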
3220
3221 //**Restructuring addition assignment to row-major matrices*************************************
3236 template< typename MT > // Type of the target matrix
3237 friend inline auto addAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3238 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3239 {
3241
3243
3244 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
3245 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
3246
3247 const ForwardFunctor fwd;
3248
3249 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
3250 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
3251
3252 addAssign( *lhs, fwd( A * B ) );
3253 }
3255 //**********************************************************************************************
3256
3257 //**Addition assignment to sparse matrices******************************************************
3258 // No special implementation for the addition assignment to sparse matrices.
3259 //**********************************************************************************************
3260
3261 //**Subtraction assignment to dense matrices****************************************************
3274 template< typename MT // Type of the target dense matrix
3275 , bool SO > // Storage order of the target dense matrix
3276 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3277 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3278 {
3280
3281 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
3282 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
3283
3284 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3285 return;
3286 }
3287
3288 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3289 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3290
3291 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3292 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3293 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3294 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3295 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
3296 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
3297
3298 TDMatTDMatMultExpr::selectSubAssignKernel( *lhs, A, B );
3299 }
3301 //**********************************************************************************************
3302
3303 //**Subtraction assignment to dense matrices (kernel selection)*********************************
3314 template< typename MT3 // Type of the left-hand side target matrix
3315 , typename MT4 // Type of the left-hand side matrix operand
3316 , typename MT5 > // Type of the right-hand side matrix operand
3317 static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3318 {
3319 if( ( IsDiagonal_v<MT4> ) ||
3320 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
3321 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3322 selectSmallSubAssignKernel( C, A, B );
3323 else
3324 selectBlasSubAssignKernel( C, A, B );
3325 }
3327 //**********************************************************************************************
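   //**Note on the subtraction assignment kernels**************************************************
   // The subtraction assignment machinery mirrors the addition assignment machinery above: the
   // kernel selection heuristic is identical and the individual default and SIMD kernels differ
   // only in the accumulation operator (-= instead of +=). At the user level this path is taken
   // by expressions of the form C -= A * B with two column-major dense matrix operands.
   //**********************************************************************************************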
3328
3329 //**Default subtraction assignment to dense matrices (general/general)**************************
3343 template< typename MT3 // Type of the left-hand side target matrix
3344 , typename MT4 // Type of the left-hand side matrix operand
3345 , typename MT5 > // Type of the right-hand side matrix operand
3346 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3347 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3348 {
3349 const size_t M( A.rows() );
3350 const size_t N( B.columns() );
3351 const size_t K( A.columns() );
3352
3353 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3354
3355 for( size_t j=0UL; j<N; ++j )
3356 {
3357 const size_t kbegin( ( IsLower_v<MT5> )
3358 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3359 :( 0UL ) );
3360 const size_t kend( ( IsUpper_v<MT5> )
3361 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3362 :( K ) );
3363 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3364
3365 for( size_t k=kbegin; k<kend; ++k )
3366 {
3367 const size_t ibegin( ( IsLower_v<MT4> )
3368 ?( ( IsStrictlyLower_v<MT4> )
3369 ?( LOW ? max(j,k+1UL) : k+1UL )
3370 :( LOW ? max(j,k) : k ) )
3371 :( LOW ? j : 0UL ) );
3372 const size_t iend( ( IsUpper_v<MT4> )
3373 ?( ( IsStrictlyUpper_v<MT4> )
3374 ?( UPP ? min(j+1UL,k) : k )
3375 :( UPP ? min(j,k)+1UL : k+1UL ) )
3376 :( UPP ? j+1UL : M ) );
3377
3378 if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
3379 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3380
3381 const size_t inum( iend - ibegin );
3382 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
3383 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
3384
3385 for( size_t i=ibegin; i<ipos; i+=2UL ) {
3386 C(i ,j) -= A(i ,k) * B(k,j);
3387 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3388 }
3389 if( ipos < iend ) {
3390 C(ipos,j) -= A(ipos,k) * B(k,j);
3391 }
3392 }
3393 }
3394 }
3396 //**********************************************************************************************
3397
3398 //**Default subtraction assignment to dense matrices (general/diagonal)*************************
3412 template< typename MT3 // Type of the left-hand side target matrix
3413 , typename MT4 // Type of the left-hand side matrix operand
3414 , typename MT5 > // Type of the right-hand side matrix operand
3415 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3416 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3417 {
3419
3420 const size_t M( A.rows() );
3421 const size_t N( B.columns() );
3422
3423 for( size_t j=0UL; j<N; ++j )
3424 {
3425 const size_t ibegin( ( IsLower_v<MT4> )
3426 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3427 :( 0UL ) );
3428 const size_t iend( ( IsUpper_v<MT4> )
3429 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
3430 :( M ) );
3431 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3432
3433 const size_t inum( iend - ibegin );
3434 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
3435 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
3436
3437 for( size_t i=ibegin; i<ipos; i+=2UL ) {
3438 C(i ,j) -= A(i ,j) * B(j,j);
3439 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
3440 }
3441 if( ipos < iend ) {
3442 C(ipos,j) -= A(ipos,j) * B(j,j);
3443 }
3444 }
3445 }
3447 //**********************************************************************************************
3448
3449 //**Default subtraction assignment to dense matrices (diagonal/general)*************************
3463 template< typename MT3 // Type of the left-hand side target matrix
3464 , typename MT4 // Type of the left-hand side matrix operand
3465 , typename MT5 > // Type of the right-hand side matrix operand
3466 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3467 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3468 {
3470
3471 const size_t M( A.rows() );
3472 const size_t N( B.columns() );
3473
3474 for( size_t j=0UL; j<N; ++j )
3475 {
3476 const size_t ibegin( ( IsLower_v<MT5> )
3477 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3478 :( 0UL ) );
3479 const size_t iend( ( IsUpper_v<MT5> )
3480 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3481 :( M ) );
3482 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3483
3484 const size_t inum( iend - ibegin );
3485 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
3486 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
3487
3488 for( size_t i=ibegin; i<ipos; i+=2UL ) {
3489 C(i ,j) -= A(i ,i ) * B(i ,j);
3490 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
3491 }
3492 if( ipos < iend ) {
3493 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
3494 }
3495 }
3496 }
3498 //**********************************************************************************************
3499
3500 //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
3514 template< typename MT3 // Type of the left-hand side target matrix
3515 , typename MT4 // Type of the left-hand side matrix operand
3516 , typename MT5 > // Type of the right-hand side matrix operand
3517 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3518 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3519 {
3521
3522 for( size_t i=0UL; i<A.rows(); ++i ) {
3523 C(i,i) -= A(i,i) * B(i,i);
3524 }
3525 }
3527 //**********************************************************************************************
3528
3529 //**Default subtraction assignment to dense matrices (small matrices)***************************
3543 template< typename MT3 // Type of the left-hand side target matrix
3544 , typename MT4 // Type of the left-hand side matrix operand
3545 , typename MT5 > // Type of the right-hand side matrix operand
3546 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3547 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3548 {
3549 selectDefaultSubAssignKernel( C, A, B );
3550 }
3552 //**********************************************************************************************
3553
3554 //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
3569 template< typename MT3 // Type of the left-hand side target matrix
3570 , typename MT4 // Type of the left-hand side matrix operand
3571 , typename MT5 > // Type of the right-hand side matrix operand
3572 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3573 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3574 {
3579
3580 const ForwardFunctor fwd;
3581
3582 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
3583 const OppositeType_t<MT5> tmp( serial( B ) );
3584 subAssign( C, fwd( A * tmp ) );
3585 }
3586 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
3587 const OppositeType_t<MT4> tmp( serial( A ) );
3588 subAssign( C, fwd( tmp * B ) );
3589 }
3590 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3591 const OppositeType_t<MT5> tmp( serial( B ) );
3592 subAssign( C, fwd( A * tmp ) );
3593 }
3594 else {
3595 const OppositeType_t<MT4> tmp( serial( A ) );
3596 subAssign( C, fwd( tmp * B ) );
3597 }
3598 }
3600 //**********************************************************************************************
3601
3602 //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
3617 template< typename MT3 // Type of the left-hand side target matrix
3618 , typename MT4 // Type of the left-hand side matrix operand
3619 , typename MT5 > // Type of the right-hand side matrix operand
3620 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3621 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3622 {
3623 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3624
3625 const size_t M( A.rows() );
3626 const size_t N( B.columns() );
3627 const size_t K( A.columns() );
3628
3629 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3630
3631 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
3632 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
3633
3634 size_t i( 0UL );
3635
3636 if( IsIntegral_v<ElementType> )
3637 {
3638 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3639 for( size_t j=0UL; j<N; ++j )
3640 {
3641 const size_t kbegin( ( IsLower_v<MT5> )
3642 ?( ( IsUpper_v<MT4> )
3643 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3644 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3645 :( IsUpper_v<MT4> ? i : 0UL ) );
3646 const size_t kend( ( IsUpper_v<MT5> )
3647 ?( ( IsLower_v<MT4> )
3648 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3649 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3650 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
3651
3652 SIMDType xmm1( C.load(i ,j) );
3653 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3654 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3655 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3656 SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3657 SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
3658 SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
3659 SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
3660
3661 for( size_t k=kbegin; k<kend; ++k ) {
3662 const SIMDType b1( set( B(k,j) ) );
3663 xmm1 -= A.load(i ,k) * b1;
3664 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3665 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3666 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3667 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3668 xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
3669 xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
3670 xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
3671 }
3672
3673 C.store( i , j, xmm1 );
3674 C.store( i+SIMDSIZE , j, xmm2 );
3675 C.store( i+SIMDSIZE*2UL, j, xmm3 );
3676 C.store( i+SIMDSIZE*3UL, j, xmm4 );
3677 C.store( i+SIMDSIZE*4UL, j, xmm5 );
3678 C.store( i+SIMDSIZE*5UL, j, xmm6 );
3679 C.store( i+SIMDSIZE*6UL, j, xmm7 );
3680 C.store( i+SIMDSIZE*7UL, j, xmm8 );
3681 }
3682 }
3683 }
3684
3685 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3686 {
3687 size_t j( 0UL );
3688
3689 for( ; (j+2UL) <= N; j+=2UL )
3690 {
3691 const size_t kbegin( ( IsLower_v<MT5> )
3692 ?( ( IsUpper_v<MT4> )
3693 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3694 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3695 :( IsUpper_v<MT4> ? i : 0UL ) );
3696 const size_t kend( ( IsUpper_v<MT5> )
3697 ?( ( IsLower_v<MT4> )
3698 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3699 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3700 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
3701
3702 SIMDType xmm1 ( C.load(i ,j ) );
3703 SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
3704 SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
3705 SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
3706 SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
3707 SIMDType xmm6 ( C.load(i ,j+1UL) );
3708 SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
3709 SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
3710 SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
3711 SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
3712
3713 for( size_t k=kbegin; k<kend; ++k ) {
3714 const SIMDType a1( A.load(i ,k) );
3715 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3716 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3717 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3718 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3719 const SIMDType b1( set( B(k,j ) ) );
3720 const SIMDType b2( set( B(k,j+1UL) ) );
3721 xmm1 -= a1 * b1;
3722 xmm2 -= a2 * b1;
3723 xmm3 -= a3 * b1;
3724 xmm4 -= a4 * b1;
3725 xmm5 -= a5 * b1;
3726 xmm6 -= a1 * b2;
3727 xmm7 -= a2 * b2;
3728 xmm8 -= a3 * b2;
3729 xmm9 -= a4 * b2;
3730 xmm10 -= a5 * b2;
3731 }
3732
3733 C.store( i , j , xmm1 );
3734 C.store( i+SIMDSIZE , j , xmm2 );
3735 C.store( i+SIMDSIZE*2UL, j , xmm3 );
3736 C.store( i+SIMDSIZE*3UL, j , xmm4 );
3737 C.store( i+SIMDSIZE*4UL, j , xmm5 );
3738 C.store( i , j+1UL, xmm6 );
3739 C.store( i+SIMDSIZE , j+1UL, xmm7 );
3740 C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3741 C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3742 C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3743 }
3744
3745 if( j < N )
3746 {
3747 const size_t kbegin( ( IsLower_v<MT5> )
3748 ?( ( IsUpper_v<MT4> )
3749 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3750 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3751 :( IsUpper_v<MT4> ? i : 0UL ) );
3752 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3753
3754 SIMDType xmm1( C.load(i ,j) );
3755 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3756 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3757 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3758 SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3759
3760 for( size_t k=kbegin; k<kend; ++k ) {
3761 const SIMDType b1( set( B(k,j) ) );
3762 xmm1 -= A.load(i ,k) * b1;
3763 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3764 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3765 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3766 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3767 }
3768
3769 C.store( i , j, xmm1 );
3770 C.store( i+SIMDSIZE , j, xmm2 );
3771 C.store( i+SIMDSIZE*2UL, j, xmm3 );
3772 C.store( i+SIMDSIZE*3UL, j, xmm4 );
3773 C.store( i+SIMDSIZE*4UL, j, xmm5 );
3774 }
3775 }
3776
3777 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3778 {
3779 size_t j( 0UL );
3780
3781 for( ; (j+2UL) <= N; j+=2UL )
3782 {
3783 const size_t kbegin( ( IsLower_v<MT5> )
3784 ?( ( IsUpper_v<MT4> )
3785 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3786 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3787 :( IsUpper_v<MT4> ? i : 0UL ) );
3788 const size_t kend( ( IsUpper_v<MT5> )
3789 ?( ( IsLower_v<MT4> )
3790 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3791 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3792 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
3793
3794 SIMDType xmm1( C.load(i ,j ) );
3795 SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
3796 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
3797 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
3798 SIMDType xmm5( C.load(i ,j+1UL) );
3799 SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
3800 SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
3801 SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
3802
3803 for( size_t k=kbegin; k<kend; ++k ) {
3804 const SIMDType a1( A.load(i ,k) );
3805 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3806 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3807 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3808 const SIMDType b1( set( B(k,j ) ) );
3809 const SIMDType b2( set( B(k,j+1UL) ) );
3810 xmm1 -= a1 * b1;
3811 xmm2 -= a2 * b1;
3812 xmm3 -= a3 * b1;
3813 xmm4 -= a4 * b1;
3814 xmm5 -= a1 * b2;
3815 xmm6 -= a2 * b2;
3816 xmm7 -= a3 * b2;
3817 xmm8 -= a4 * b2;
3818 }
3819
3820 C.store( i , j , xmm1 );
3821 C.store( i+SIMDSIZE , j , xmm2 );
3822 C.store( i+SIMDSIZE*2UL, j , xmm3 );
3823 C.store( i+SIMDSIZE*3UL, j , xmm4 );
3824 C.store( i , j+1UL, xmm5 );
3825 C.store( i+SIMDSIZE , j+1UL, xmm6 );
3826 C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3827 C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3828 }
3829
3830 if( j < N )
3831 {
3832 const size_t kbegin( ( IsLower_v<MT5> )
3833 ?( ( IsUpper_v<MT4> )
3834 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3835 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3836 :( IsUpper_v<MT4> ? i : 0UL ) );
3837 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3838
3839 SIMDType xmm1( C.load(i ,j) );
3840 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3841 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3842 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3843
3844 for( size_t k=kbegin; k<kend; ++k ) {
3845 const SIMDType b1( set( B(k,j) ) );
3846 xmm1 -= A.load(i ,k) * b1;
3847 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3848 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3849 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3850 }
3851
3852 C.store( i , j, xmm1 );
3853 C.store( i+SIMDSIZE , j, xmm2 );
3854 C.store( i+SIMDSIZE*2UL, j, xmm3 );
3855 C.store( i+SIMDSIZE*3UL, j, xmm4 );
3856 }
3857 }
3858
3859 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3860 {
3861 size_t j( 0UL );
3862
3863 for( ; (j+2UL) <= N; j+=2UL )
3864 {
3865 const size_t kbegin( ( IsLower_v<MT5> )
3866 ?( ( IsUpper_v<MT4> )
3867 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3868 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3869 :( IsUpper_v<MT4> ? i : 0UL ) );
3870 const size_t kend( ( IsUpper_v<MT5> )
3871 ?( ( IsLower_v<MT4> )
3872 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3873 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3874 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
3875
3876 SIMDType xmm1( C.load(i ,j ) );
3877 SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
3878 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
3879 SIMDType xmm4( C.load(i ,j+1UL) );
3880 SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
3881 SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
3882
3883 for( size_t k=kbegin; k<kend; ++k ) {
3884 const SIMDType a1( A.load(i ,k) );
3885 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3886 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3887 const SIMDType b1( set( B(k,j ) ) );
3888 const SIMDType b2( set( B(k,j+1UL) ) );
3889 xmm1 -= a1 * b1;
3890 xmm2 -= a2 * b1;
3891 xmm3 -= a3 * b1;
3892 xmm4 -= a1 * b2;
3893 xmm5 -= a2 * b2;
3894 xmm6 -= a3 * b2;
3895 }
3896
3897 C.store( i , j , xmm1 );
3898 C.store( i+SIMDSIZE , j , xmm2 );
3899 C.store( i+SIMDSIZE*2UL, j , xmm3 );
3900 C.store( i , j+1UL, xmm4 );
3901 C.store( i+SIMDSIZE , j+1UL, xmm5 );
3902 C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
3903 }
3904
3905 if( j < N )
3906 {
3907 const size_t kbegin( ( IsLower_v<MT5> )
3908 ?( ( IsUpper_v<MT4> )
3909 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3910 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3911 :( IsUpper_v<MT4> ? i : 0UL ) );
3912 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
3913
3914 SIMDType xmm1( C.load(i ,j) );
3915 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3916 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3917
3918 for( size_t k=kbegin; k<kend; ++k ) {
3919 const SIMDType b1( set( B(k,j) ) );
3920 xmm1 -= A.load(i ,k) * b1;
3921 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3922 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3923 }
3924
3925 C.store( i , j, xmm1 );
3926 C.store( i+SIMDSIZE , j, xmm2 );
3927 C.store( i+SIMDSIZE*2UL, j, xmm3 );
3928 }
3929 }
3930
3931 for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3932 {
3933 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
3934 size_t j( UPP ? i : 0UL );
3935
3936 for( ; (j+4UL) <= jend; j+=4UL )
3937 {
3938 const size_t kbegin( ( IsLower_v<MT5> )
3939 ?( ( IsUpper_v<MT4> )
3940 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3941 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3942 :( IsUpper_v<MT4> ? i : 0UL ) );
3943 const size_t kend( ( IsUpper_v<MT5> )
3944 ?( ( IsLower_v<MT4> )
3945 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
3946 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
3947 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
3948
3949 SIMDType xmm1( C.load(i ,j ) );
3950 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
3951 SIMDType xmm3( C.load(i ,j+1UL) );
3952 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
3953 SIMDType xmm5( C.load(i ,j+2UL) );
3954 SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
3955 SIMDType xmm7( C.load(i ,j+3UL) );
3956 SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
3957
3958 for( size_t k=kbegin; k<kend; ++k ) {
3959 const SIMDType a1( A.load(i ,k) );
3960 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3961 const SIMDType b1( set( B(k,j ) ) );
3962 const SIMDType b2( set( B(k,j+1UL) ) );
3963 const SIMDType b3( set( B(k,j+2UL) ) );
3964 const SIMDType b4( set( B(k,j+3UL) ) );
3965 xmm1 -= a1 * b1;
3966 xmm2 -= a2 * b1;
3967 xmm3 -= a1 * b2;
3968 xmm4 -= a2 * b2;
3969 xmm5 -= a1 * b3;
3970 xmm6 -= a2 * b3;
3971 xmm7 -= a1 * b4;
3972 xmm8 -= a2 * b4;
3973 }
3974
3975 C.store( i , j , xmm1 );
3976 C.store( i+SIMDSIZE, j , xmm2 );
3977 C.store( i , j+1UL, xmm3 );
3978 C.store( i+SIMDSIZE, j+1UL, xmm4 );
3979 C.store( i , j+2UL, xmm5 );
3980 C.store( i+SIMDSIZE, j+2UL, xmm6 );
3981 C.store( i , j+3UL, xmm7 );
3982 C.store( i+SIMDSIZE, j+3UL, xmm8 );
3983 }
3984
3985 for( ; (j+3UL) <= jend; j+=3UL )
3986 {
3987 const size_t kbegin( ( IsLower_v<MT5> )
3988 ?( ( IsUpper_v<MT4> )
3989 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3990 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3991 :( IsUpper_v<MT4> ? i : 0UL ) );
3992 const size_t kend( ( IsUpper_v<MT5> )
3993 ?( ( IsLower_v<MT4> )
3994 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
3995 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
3996 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
3997
3998 SIMDType xmm1( C.load(i ,j ) );
3999 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4000 SIMDType xmm3( C.load(i ,j+1UL) );
4001 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4002 SIMDType xmm5( C.load(i ,j+2UL) );
4003 SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
4004
4005 for( size_t k=kbegin; k<kend; ++k ) {
4006 const SIMDType a1( A.load(i ,k) );
4007 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4008 const SIMDType b1( set( B(k,j ) ) );
4009 const SIMDType b2( set( B(k,j+1UL) ) );
4010 const SIMDType b3( set( B(k,j+2UL) ) );
4011 xmm1 -= a1 * b1;
4012 xmm2 -= a2 * b1;
4013 xmm3 -= a1 * b2;
4014 xmm4 -= a2 * b2;
4015 xmm5 -= a1 * b3;
4016 xmm6 -= a2 * b3;
4017 }
4018
4019 C.store( i , j , xmm1 );
4020 C.store( i+SIMDSIZE, j , xmm2 );
4021 C.store( i , j+1UL, xmm3 );
4022 C.store( i+SIMDSIZE, j+1UL, xmm4 );
4023 C.store( i , j+2UL, xmm5 );
4024 C.store( i+SIMDSIZE, j+2UL, xmm6 );
4025 }
4026
4027 for( ; (j+2UL) <= jend; j+=2UL )
4028 {
4029 const size_t kbegin( ( IsLower_v<MT5> )
4030 ?( ( IsUpper_v<MT4> )
4031 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4032 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4033 :( IsUpper_v<MT4> ? i : 0UL ) );
4034 const size_t kend( ( IsUpper_v<MT5> )
4035 ?( ( IsLower_v<MT4> )
4036 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4037 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4038 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4039
4040 SIMDType xmm1( C.load(i ,j ) );
4041 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4042 SIMDType xmm3( C.load(i ,j+1UL) );
4043 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4044
4045 for( size_t k=kbegin; k<kend; ++k ) {
4046 const SIMDType a1( A.load(i ,k) );
4047 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4048 const SIMDType b1( set( B(k,j ) ) );
4049 const SIMDType b2( set( B(k,j+1UL) ) );
4050 xmm1 -= a1 * b1;
4051 xmm2 -= a2 * b1;
4052 xmm3 -= a1 * b2;
4053 xmm4 -= a2 * b2;
4054 }
4055
4056 C.store( i , j , xmm1 );
4057 C.store( i+SIMDSIZE, j , xmm2 );
4058 C.store( i , j+1UL, xmm3 );
4059 C.store( i+SIMDSIZE, j+1UL, xmm4 );
4060 }
4061
4062 if( j < jend )
4063 {
4064 const size_t kbegin( ( IsLower_v<MT5> )
4065 ?( ( IsUpper_v<MT4> )
4066 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4067 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4068 :( IsUpper_v<MT4> ? i : 0UL ) );
4069 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
4070
4071 SIMDType xmm1( C.load(i ,j) );
4072 SIMDType xmm2( C.load(i+SIMDSIZE,j) );
4073
4074 for( size_t k=kbegin; k<kend; ++k ) {
4075 const SIMDType b1( set( B(k,j) ) );
4076 xmm1 -= A.load(i ,k) * b1;
4077 xmm2 -= A.load(i+SIMDSIZE,k) * b1;
4078 }
4079
4080 C.store( i , j, xmm1 );
4081 C.store( i+SIMDSIZE, j, xmm2 );
4082 }
4083 }
4084
4085 for( ; i<ipos; i+=SIMDSIZE )
4086 {
4087 const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
4088 size_t j( UPP ? i : 0UL );
4089
4090 for( ; (j+4UL) <= jend; j+=4UL )
4091 {
4092 const size_t kbegin( ( IsLower_v<MT5> )
4093 ?( ( IsUpper_v<MT4> )
4094 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4095 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4096 :( IsUpper_v<MT4> ? i : 0UL ) );
4097 const size_t kend( ( IsUpper_v<MT5> )
4098 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
4099 :( K ) );
4100
4101 SIMDType xmm1( C.load(i,j ) );
4102 SIMDType xmm2( C.load(i,j+1UL) );
4103 SIMDType xmm3( C.load(i,j+2UL) );
4104 SIMDType xmm4( C.load(i,j+3UL) );
4105
4106 for( size_t k=kbegin; k<kend; ++k ) {
4107 const SIMDType a1( A.load(i,k) );
4108 xmm1 -= a1 * set( B(k,j ) );
4109 xmm2 -= a1 * set( B(k,j+1UL) );
4110 xmm3 -= a1 * set( B(k,j+2UL) );
4111 xmm4 -= a1 * set( B(k,j+3UL) );
4112 }
4113
4114 C.store( i, j , xmm1 );
4115 C.store( i, j+1UL, xmm2 );
4116 C.store( i, j+2UL, xmm3 );
4117 C.store( i, j+3UL, xmm4 );
4118 }
4119
4120 for( ; (j+3UL) <= jend; j+=3UL )
4121 {
4122 const size_t kbegin( ( IsLower_v<MT5> )
4123 ?( ( IsUpper_v<MT4> )
4124 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4125 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4126 :( IsUpper_v<MT4> ? i : 0UL ) );
4127 const size_t kend( ( IsUpper_v<MT5> )
4128 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4129 :( K ) );
4130
4131 SIMDType xmm1( C.load(i,j ) );
4132 SIMDType xmm2( C.load(i,j+1UL) );
4133 SIMDType xmm3( C.load(i,j+2UL) );
4134
4135 for( size_t k=kbegin; k<kend; ++k ) {
4136 const SIMDType a1( A.load(i,k) );
4137 xmm1 -= a1 * set( B(k,j ) );
4138 xmm2 -= a1 * set( B(k,j+1UL) );
4139 xmm3 -= a1 * set( B(k,j+2UL) );
4140 }
4141
4142 C.store( i, j , xmm1 );
4143 C.store( i, j+1UL, xmm2 );
4144 C.store( i, j+2UL, xmm3 );
4145 }
4146
4147 for( ; (j+2UL) <= jend; j+=2UL )
4148 {
4149 const size_t kbegin( ( IsLower_v<MT5> )
4150 ?( ( IsUpper_v<MT4> )
4151 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4152 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4153 :( IsUpper_v<MT4> ? i : 0UL ) );
4154 const size_t kend( ( IsUpper_v<MT5> )
4155 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4156 :( K ) );
4157
4158 SIMDType xmm1( C.load(i,j ) );
4159 SIMDType xmm2( C.load(i,j+1UL) );
4160
4161 for( size_t k=kbegin; k<kend; ++k ) {
4162 const SIMDType a1( A.load(i,k) );
4163 xmm1 -= a1 * set( B(k,j ) );
4164 xmm2 -= a1 * set( B(k,j+1UL) );
4165 }
4166
4167 C.store( i, j , xmm1 );
4168 C.store( i, j+1UL, xmm2 );
4169 }
4170
4171 if( j < jend )
4172 {
4173 const size_t kbegin( ( IsLower_v<MT5> )
4174 ?( ( IsUpper_v<MT4> )
4175 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4176 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4177 :( IsUpper_v<MT4> ? i : 0UL ) );
4178
4179 SIMDType xmm1( C.load(i,j) );
4180
4181 for( size_t k=kbegin; k<K; ++k ) {
4182 xmm1 -= A.load(i,k) * set( B(k,j) );
4183 }
4184
4185 C.store( i, j, xmm1 );
4186 }
4187 }
4188
4189 for( ; remainder && i<M; ++i )
4190 {
4191 const size_t jend( LOW ? i+1UL : N );
4192 size_t j( UPP ? i : 0UL );
4193
4194 for( ; (j+2UL) <= jend; j+=2UL )
4195 {
4196 const size_t kbegin( ( IsLower_v<MT5> )
4197 ?( ( IsUpper_v<MT4> )
4198 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4199 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4200 :( IsUpper_v<MT4> ? i : 0UL ) );
4201 const size_t kend( ( IsUpper_v<MT5> )
4202 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4203 :( K ) );
4204
4205 ElementType value1( C(i,j ) );
4206 ElementType value2( C(i,j+1UL) );
4207
4208 for( size_t k=kbegin; k<kend; ++k ) {
4209 value1 -= A(i,k) * B(k,j );
4210 value2 -= A(i,k) * B(k,j+1UL);
4211 }
4212
4213 C(i,j ) = value1;
4214 C(i,j+1UL) = value2;
4215 }
4216
4217 if( j < jend )
4218 {
4219 const size_t kbegin( ( IsLower_v<MT5> )
4220 ?( ( IsUpper_v<MT4> )
4221 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4222 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4223 :( IsUpper_v<MT4> ? i : 0UL ) );
4224
4225 ElementType value( C(i,j) );
4226
4227 for( size_t k=kbegin; k<K; ++k ) {
4228 value -= A(i,k) * B(k,j);
4229 }
4230
4231 C(i,j) = value;
4232 }
4233 }
4234 }
4236 //**********************************************************************************************
4237
4238 //**Default subtraction assignment to dense matrices (large matrices)***************************
4252 template< typename MT3 // Type of the left-hand side target matrix
4253 , typename MT4 // Type of the left-hand side matrix operand
4254 , typename MT5 > // Type of the right-hand side matrix operand
4255 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4256 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4257 {
4258 selectDefaultSubAssignKernel( C, A, B );
4259 }
4261 //**********************************************************************************************
4262
4263 //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
4278 template< typename MT3 // Type of the left-hand side target matrix
4279 , typename MT4 // Type of the left-hand side matrix operand
4280 , typename MT5 > // Type of the right-hand side matrix operand
4281 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4282 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4283 {
4284 if( LOW )
4285 lmmm( C, A, B, ElementType(-1), ElementType(1) );
4286 else if( UPP )
4287 ummm( C, A, B, ElementType(-1), ElementType(1) );
4288 else
4289 mmm( C, A, B, ElementType(-1), ElementType(1) );
4290 }
4292 //**********************************************************************************************
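   // Note: mmm(), lmmm(), and ummm() are Blaze's blocked multiplication helpers; called as above
   // they effectively compute C = alpha*A*B + beta*C, which with alpha = -1 and beta = 1 amounts
   // to C -= A*B. lmmm() and ummm() restrict the update to the lower resp. upper part of C.
   // Roughly (unblocked, unvectorized) this corresponds to the following sketch:
   //
   //    for( size_t i=0UL; i<C.rows(); ++i )
   //       for( size_t j=0UL; j<C.columns(); ++j )
   //          for( size_t k=0UL; k<A.columns(); ++k )
   //             C(i,j) -= A(i,k) * B(k,j);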
4293
4294 //**BLAS-based subtraction assignment to dense matrices (default)*******************************
4308 template< typename MT3 // Type of the left-hand side target matrix
4309 , typename MT4 // Type of the left-hand side matrix operand
4310 , typename MT5 > // Type of the right-hand side matrix operand
4311 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4312 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4313 {
4314 selectLargeSubAssignKernel( C, A, B );
4315 }
4317 //**********************************************************************************************
4318
4319 //**BLAS-based subtraction assignment to dense matrices*****************************************
4320#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4334 template< typename MT3 // Type of the left-hand side target matrix
4335 , typename MT4 // Type of the left-hand side matrix operand
4336 , typename MT5 > // Type of the right-hand side matrix operand
4337 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4338 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4339 {
4340 using ET = ElementType_t<MT3>;
4341
4342 if( IsTriangular_v<MT4> ) {
4343 ResultType_t<MT3> tmp( serial( B ) );
4344 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4345 subAssign( C, tmp );
4346 }
4347 else if( IsTriangular_v<MT5> ) {
4348 ResultType_t<MT3> tmp( serial( A ) );
4349 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4350 subAssign( C, tmp );
4351 }
4352 else {
4353 gemm( C, A, B, ET(-1), ET(1) );
4354 }
4355 }
4357#endif
4358 //**********************************************************************************************
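   // Note: In the BLAS-based kernel above a triangular operand is folded in via trmm() on a
   // temporary copy of the other operand, and the temporary product is then subtracted from C.
   // In the general case gemm() updates C in place; with the scalars used here it effectively
   // evaluates C = ET(-1)*A*B + ET(1)*C, i.e. C -= A*B.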
4359
4360 //**Restructuring subtraction assignment to row-major matrices**********************************
4376 template< typename MT > // Type of the target matrix
4377 friend inline auto subAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4378 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4379 {
4381
4383
4384 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4385 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4386
4387 const ForwardFunctor fwd;
4388
4389 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4390 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4391
4392 subAssign( *lhs, fwd( A * B ) );
4393 }
4395 //**********************************************************************************************
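   // Note: The restructuring subtraction assignment above exploits symmetry: for a symmetric
   // operand, transIf<true>() applies trans(), which leaves the values unchanged but flips the
   // storage order, so the product can be forwarded in a form that better suits the row-major
   // target. Assuming, for instance, that MT1 is symmetric, the call effectively becomes
   //
   //    subAssign( *lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );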
4396
4397 //**Subtraction assignment to sparse matrices***************************************************
4398 // No special implementation for the subtraction assignment to sparse matrices.
4399 //**********************************************************************************************
4400
4401 //**Schur product assignment to dense matrices**************************************************
4414 template< typename MT // Type of the target dense matrix
4415 , bool SO > // Storage order of the target dense matrix
4416 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4417 {
4419
4423
4424 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4425 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4426
4427 const ResultType tmp( serial( rhs ) );
4428 schurAssign( *lhs, tmp );
4429 }
4431 //**********************************************************************************************
4432
4433 //**Multiplication assignment to dense matrices*************************************************
4434 // No special implementation for the multiplication assignment to dense matrices.
4435 //**********************************************************************************************
4436
4437 //**Multiplication assignment to sparse matrices************************************************
4438 // No special implementation for the multiplication assignment to sparse matrices.
4439 //**********************************************************************************************
4440
4441 //**SMP assignment to dense matrices************************************************************
4457 template< typename MT // Type of the target dense matrix
4458 , bool SO > // Storage order of the target dense matrix
4459 friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4460 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4461 {
4463
4464 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4465 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4466
4467 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
4468 return;
4469 }
4470 else if( rhs.lhs_.columns() == 0UL ) {
4471 reset( *lhs );
4472 return;
4473 }
4474
4475 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4476 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4477
4478 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4479 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4480 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4481 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4482 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
4483 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
4484
4485 smpAssign( *lhs, A * B );
4486 }
4488 //**********************************************************************************************
4489
4490 //**SMP assignment to sparse matrices***********************************************************
4506 template< typename MT // Type of the target sparse matrix
4507 , bool SO > // Storage order of the target sparse matrix
4508 friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4509 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4510 {
4512
4513 using TmpType = If_t< SO, ResultType, OppositeType >;
4514
4521
4522 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4523 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4524
4525 const ForwardFunctor fwd;
4526
4527 const TmpType tmp( rhs );
4528 smpAssign( *lhs, fwd( tmp ) );
4529 }
4531 //**********************************************************************************************
4532
4533 //**Restructuring SMP assignment to row-major matrices******************************************
4548 template< typename MT > // Type of the target matrix
4549 friend inline auto smpAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4550 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4551 {
4553
4555
4556 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4557 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4558
4559 const ForwardFunctor fwd;
4560
4561 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4562 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4563
4564 smpAssign( *lhs, fwd( A * B ) );
4565 }
4567 //**********************************************************************************************
4568
4569 //**SMP addition assignment to dense matrices***************************************************
4585 template< typename MT // Type of the target dense matrix
4586 , bool SO > // Storage order of the target dense matrix
4587 friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4588 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4589 {
4591
4592 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4593 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4594
4595 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4596 return;
4597 }
4598
4599 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4600 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4601
4602 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4603 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4604 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4605 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4606 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
4607 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
4608
4609 smpAddAssign( *lhs, A * B );
4610 }
4612 //**********************************************************************************************
4613
4614 //**Restructuring SMP addition assignment to row-major matrices*********************************
4630 template< typename MT > // Type of the target matrix
4631 friend inline auto smpAddAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4632 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4633 {
4635
4637
4638 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4639 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4640
4641 const ForwardFunctor fwd;
4642
4643 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4644 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4645
4646 smpAddAssign( *lhs, fwd( A * B ) );
4647 }
4649 //**********************************************************************************************
4650
4651 //**SMP addition assignment to sparse matrices**************************************************
4652 // No special implementation for the SMP addition assignment to sparse matrices.
4653 //**********************************************************************************************
4654
4655 //**SMP subtraction assignment to dense matrices************************************************
4671 template< typename MT // Type of the target dense matrix
4672 , bool SO > // Storage order of the target dense matrix
4673 friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4674 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4675 {
4677
4678 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4679 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4680
4681 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4682 return;
4683 }
4684
4685 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4686 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4687
4688 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4689 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4690 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4691 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4692 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
4693 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
4694
4695 smpSubAssign( *lhs, A * B );
4696 }
4698 //**********************************************************************************************
4699
4700 //**Restructuring SMP subtraction assignment to row-major matrices******************************
4716 template< typename MT > // Type of the target matrix
4717 friend inline auto smpSubAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4718 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4719 {
4721
4723
4724 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4725 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4726
4727 const ForwardFunctor fwd;
4728
4729 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4730 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4731
4732 smpSubAssign( *lhs, fwd( A * B ) );
4733 }
4735 //**********************************************************************************************
4736
4737 //**SMP subtraction assignment to sparse matrices***********************************************
4738 // No special implementation for the SMP subtraction assignment to sparse matrices.
4739 //**********************************************************************************************
4740
4741 //**SMP Schur product assignment to dense matrices**********************************************
4755 template< typename MT // Type of the target dense matrix
4756 , bool SO > // Storage order of the target dense matrix
4757 friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4758 {
4760
4764
4765 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4766 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4767
4768 const ResultType tmp( rhs );
4769 smpSchurAssign( *lhs, tmp );
4770 }
4772 //**********************************************************************************************
4773
4774 //**SMP Schur product assignment to sparse matrices*********************************************
4775 // No special implementation for the SMP Schur product assignment to sparse matrices.
4776 //**********************************************************************************************
4777
4778 //**SMP multiplication assignment to dense matrices*********************************************
4779 // No special implementation for the SMP multiplication assignment to dense matrices.
4780 //**********************************************************************************************
4781
4782 //**SMP multiplication assignment to sparse matrices********************************************
4783 // No special implementation for the SMP multiplication assignment to sparse matrices.
4784 //**********************************************************************************************
4785
4786 //**Compile time checks*************************************************************************
4794 //**********************************************************************************************
4795};
4796//*************************************************************************************************
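// Note: Objects of the TDMatTDMatMultExpr class template are not created explicitly, but result
// from multiplying two column-major dense matrices. A minimal usage sketch (the matrices below
// are illustrative placeholders):
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 32UL, 24UL );
//    blaze::DynamicMatrix<double,blaze::columnMajor> B( 24UL, 48UL );
//    blaze::DynamicMatrix<double,blaze::columnMajor> C( 32UL, 48UL );
//
//    C  = A * B;   // Creates a TDMatTDMatMultExpr, evaluated by the assignment kernels
//    C -= A * B;   // Dispatches to the subtraction assignment kernels shown above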
4797
4798
4799
4800
4801//=================================================================================================
4802//
4803// DMATSCALARMULTEXPR SPECIALIZATION
4804//
4805//=================================================================================================
4806
4807//*************************************************************************************************
4815template< typename MT1 // Type of the left-hand side dense matrix
4816 , typename MT2 // Type of the right-hand side dense matrix
4817 , bool SF // Symmetry flag
4818 , bool HF // Hermitian flag
4819 , bool LF // Lower flag
4820 , bool UF // Upper flag
4821 , typename ST > // Type of the right-hand side scalar value
4822class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
4823 : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4824 , private Computation
4825{
4826 private:
4827 //**Type definitions****************************************************************************
4829 using MMM = TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4830
4831 using RES = ResultType_t<MMM>;
4832 using RT1 = ResultType_t<MT1>;
4833 using RT2 = ResultType_t<MT2>;
4834 using ET1 = ElementType_t<RT1>;
4835 using ET2 = ElementType_t<RT2>;
4836 using CT1 = CompositeType_t<MT1>;
4837 using CT2 = CompositeType_t<MT2>;
4838 //**********************************************************************************************
4839
4840 //**********************************************************************************************
4842 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4843 //**********************************************************************************************
4844
4845 //**********************************************************************************************
4847 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
4848 //**********************************************************************************************
4849
4850 //**********************************************************************************************
4851 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
4852 static constexpr bool HERM = ( HF && !( LF || UF ) );
4853 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4854 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
4855 //**********************************************************************************************
4856
4857 //**********************************************************************************************
4859
4863 template< typename T1, typename T2, typename T3 >
4864 static constexpr bool CanExploitSymmetry_v =
4865 ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
4866 //**********************************************************************************************
4867
4868 //**********************************************************************************************
4870
4873 template< typename T1, typename T2, typename T3 >
4874 static constexpr bool IsEvaluationRequired_v =
4875 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
4876 //**********************************************************************************************
4877
4878 //**********************************************************************************************
4880
4882 template< typename T1, typename T2, typename T3, typename T4 >
4883 static constexpr bool UseBlasKernel_v =
4884 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4885 !SYM && !HERM && !LOW && !UPP &&
4886 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4887 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4888 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4889 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4890 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4891 IsBLASCompatible_v< ElementType_t<T1> > &&
4892 IsBLASCompatible_v< ElementType_t<T2> > &&
4893 IsBLASCompatible_v< ElementType_t<T3> > &&
4894 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4895 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4896 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4897 //**********************************************************************************************
4898
4899 //**********************************************************************************************
4901
4903 template< typename T1, typename T2, typename T3, typename T4 >
4904 static constexpr bool UseVectorizedDefaultKernel_v =
4905 ( useOptimizedKernels &&
4906 !IsDiagonal_v<T2> &&
4907 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4908 IsSIMDCombinable_v< ElementType_t<T1>
4909 , ElementType_t<T2>
4910 , ElementType_t<T3>
4911 , T4 > &&
4912        HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4913        HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
4914 //**********************************************************************************************
4915
4916 //**********************************************************************************************
4918
4920 using ForwardFunctor = If_t< HERM
4921 , DeclHerm
4922 , If_t< SYM
4923 , DeclSym
4924 , If_t< LOW
4925 , If_t< UPP
4926 , DeclDiag
4927 , DeclLow >
4928 , If_t< UPP
4929 , DeclUpp
4930 , Noop > > > >;
4931 //**********************************************************************************************
4932
4933 public:
4934 //**Type definitions****************************************************************************
4936 using This = DMatScalarMultExpr<MMM,ST,true>;
4937
4939 using BaseType = MatScalarMultExpr< DenseMatrix<This,true> >;
4940
4942 using ResultType = typename If_t< HERM
4943 , DeclHermTrait< MultTrait_t<RES,ST> >
4944 , If_t< SYM
4945 , DeclSymTrait< MultTrait_t<RES,ST> >
4946 , If_t< LOW
4947 , If_t< UPP
4948 , DeclDiagTrait< MultTrait_t<RES,ST> >
4949 , DeclLowTrait< MultTrait_t<RES,ST> > >
4950 , If_t< UPP
4951 , DeclUppTrait< MultTrait_t<RES,ST> >
4952 , MultTrait<RES,ST> > > > >::Type;
4953
4954 using OppositeType = OppositeType_t<ResultType>;
4955 using TransposeType = TransposeType_t<ResultType>;
4956 using ElementType = ElementType_t<ResultType>;
4957 using SIMDType = SIMDTrait_t<ElementType>;
4958 using ReturnType = const ElementType;
4959 using CompositeType = const ResultType;
4960
4962 using LeftOperand = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4963
4965 using RightOperand = ST;
4966
4968 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4969
4971 using RT = If_t< evaluateRight, const RT2, CT2 >;
4972 //**********************************************************************************************
4973
4974 //**Compilation flags***************************************************************************
4976 static constexpr bool simdEnabled =
4977 ( !IsDiagonal_v<MT1> &&
4978 MT1::simdEnabled && MT2::simdEnabled &&
4979 IsSIMDCombinable_v<ET1,ET2,ST> &&
4980 HasSIMDAdd_v<ET1,ET2> &&
4981 HasSIMDMult_v<ET1,ET2> );
4982
4984 static constexpr bool smpAssignable =
4985 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4986 //**********************************************************************************************
4987
4988 //**SIMD properties*****************************************************************************
4990 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
4991 //**********************************************************************************************
4992
4993 //**Constructor*********************************************************************************
4999 inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
5000 : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
5001 , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
5002 {}
5003 //**********************************************************************************************
5004
5005 //**Access operator*****************************************************************************
5012 inline ReturnType operator()( size_t i, size_t j ) const {
5013 BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
5014 BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
5015 return matrix_(i,j) * scalar_;
5016 }
5017 //**********************************************************************************************
5018
5019 //**At function*********************************************************************************
5027 inline ReturnType at( size_t i, size_t j ) const {
5028 if( i >= matrix_.rows() ) {
5029 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
5030 }
5031 if( j >= matrix_.columns() ) {
5032 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
5033 }
5034 return (*this)(i,j);
5035 }
5036 //**********************************************************************************************
5037
5038 //**Rows function*******************************************************************************
5043 inline size_t rows() const {
5044 return matrix_.rows();
5045 }
5046 //**********************************************************************************************
5047
5048 //**Columns function****************************************************************************
5053 inline size_t columns() const {
5054 return matrix_.columns();
5055 }
5056 //**********************************************************************************************
5057
5058 //**Left operand access*************************************************************************
5063 inline LeftOperand leftOperand() const {
5064 return matrix_;
5065 }
5066 //**********************************************************************************************
5067
5068 //**Right operand access************************************************************************
5073 inline RightOperand rightOperand() const {
5074 return scalar_;
5075 }
5076 //**********************************************************************************************
5077
5078 //**********************************************************************************************
5084 template< typename T >
5085 inline bool canAlias( const T* alias ) const {
5086 return matrix_.canAlias( alias );
5087 }
5088 //**********************************************************************************************
5089
5090 //**********************************************************************************************
5096 template< typename T >
5097 inline bool isAliased( const T* alias ) const {
5098 return matrix_.isAliased( alias );
5099 }
5100 //**********************************************************************************************
5101
5102 //**********************************************************************************************
5107 inline bool isAligned() const {
5108 return matrix_.isAligned();
5109 }
5110 //**********************************************************************************************
5111
5112 //**********************************************************************************************
5117 inline bool canSMPAssign() const noexcept {
5118 return ( !BLAZE_BLAS_MODE ||
5119 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
5121 ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
5122 ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
5123 }
5124 //**********************************************************************************************
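   // Note: canSMPAssign() gates the SMP kernels of this specialization: the scaled product is
   // only assigned in parallel if the target holds at least SMP_TDMATTDMATMULT_THRESHOLD elements
   // and no serial BLAS kernel is expected to take over for the given problem size.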
5125
5126 private:
5127 //**Member variables****************************************************************************
5130 //**********************************************************************************************
5131
5132 //**Assignment to dense matrices****************************************************************
5144 template< typename MT // Type of the target dense matrix
5145 , bool SO > // Storage order of the target dense matrix
5146 friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5147 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
5148 {
5150
5151 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
5152 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
5153
5154 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5155 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
5156
5157 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
5158 return;
5159 }
5160 else if( left.columns() == 0UL ) {
5161 reset( *lhs );
5162 return;
5163 }
5164
5165 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5166 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5167
5168 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5169 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5170 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5171 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5172 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
5173 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
5174
5175 DMatScalarMultExpr::selectAssignKernel( *lhs, A, B, rhs.scalar_ );
5176 }
5177 //**********************************************************************************************
5178
5179 //**Assignment to dense matrices (kernel selection)*********************************************
5190 template< typename MT3 // Type of the left-hand side target matrix
5191 , typename MT4 // Type of the left-hand side matrix operand
5192 , typename MT5 // Type of the right-hand side matrix operand
5193 , typename ST2 > // Type of the scalar value
5194 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5195 {
5196 if( ( IsDiagonal_v<MT4> ) ||
5197 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
5198 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5199 selectSmallAssignKernel( C, A, B, scalar );
5200 else
5201 selectBlasAssignKernel( C, A, B, scalar );
5202 }
5203 //**********************************************************************************************
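   // Note: The kernel selection above is a size heuristic: diagonal left-hand side operands and
   // small products (below TDMATTDMATMULT_THRESHOLD) are handled by the (vectorized) small
   // kernels, while larger products are forwarded to selectBlasAssignKernel(), which in turn
   // falls back to the large default kernel if no suitable BLAS routine is available.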
5204
5205 //**Default assignment to dense matrices (general/general)**************************************
5219 template< typename MT3 // Type of the left-hand side target matrix
5220 , typename MT4 // Type of the left-hand side matrix operand
5221 , typename MT5 // Type of the right-hand side matrix operand
5222 , typename ST2 > // Type of the scalar value
5223 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5224 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5225 {
5226 const size_t M( A.rows() );
5227 const size_t N( B.columns() );
5228 const size_t K( A.columns() );
5229
5230 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5231
5232 for( size_t j=0UL; j<N; ++j )
5233 {
5234 const size_t kbegin( ( IsLower_v<MT5> )
5235 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5236 :( 0UL ) );
5237 const size_t kend( ( IsUpper_v<MT5> )
5238 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5239 :( K ) );
5240 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5241
5242 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
5243 for( size_t i=0UL; i<M; ++i ) {
5244 reset( C(i,j) );
5245 }
5246 continue;
5247 }
5248
5249 {
5250 const size_t ibegin( ( IsLower_v<MT4> )
5251 ?( ( IsStrictlyLower_v<MT4> )
5252 ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
5253 :( LOW ? max(j,kbegin) : kbegin ) )
5254 :( LOW ? j : 0UL ) );
5255 const size_t iend( ( IsUpper_v<MT4> )
5256 ?( ( IsStrictlyUpper_v<MT4> )
5257 ?( UPP ? min(j+1UL,kbegin) : kbegin )
5258 :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
5259 :( UPP ? j+1UL : M ) );
5260
5261 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5262 for( size_t i=0UL; i<ibegin; ++i ) {
5263 reset( C(i,j) );
5264 }
5265 }
5266 else if( IsStrictlyLower_v<MT4> ) {
5267 reset( C(0UL,j) );
5268 }
5269 for( size_t i=ibegin; i<iend; ++i ) {
5270 C(i,j) = A(i,kbegin) * B(kbegin,j);
5271 }
5272 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5273 for( size_t i=iend; i<M; ++i ) {
5274 reset( C(i,j) );
5275 }
5276 }
5277 else if( IsStrictlyUpper_v<MT4> ) {
5278 reset( C(M-1UL,j) );
5279 }
5280 }
5281
5282 for( size_t k=kbegin+1UL; k<kend; ++k )
5283 {
5284 const size_t ibegin( ( IsLower_v<MT4> )
5285 ?( ( IsStrictlyLower_v<MT4> )
5286 ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
5287 :( SYM || HERM || LOW ? max( j, k ) : k ) )
5288 :( SYM || HERM || LOW ? j : 0UL ) );
5289 const size_t iend( ( IsUpper_v<MT4> )
5290 ?( ( IsStrictlyUpper_v<MT4> )
5291 ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
5292 :( UPP ? min(j+1UL,k) : k ) )
5293 :( UPP ? j+1UL : M ) );
5294
5295 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5296 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5297
5298 for( size_t i=ibegin; i<iend; ++i ) {
5299 C(i,j) += A(i,k) * B(k,j);
5300 }
5301 if( IsUpper_v<MT4> ) {
5302 C(iend,j) = A(iend,k) * B(k,j);
5303 }
5304 }
5305
5306 {
5307 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
5308 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
5309 :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5310 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5311 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
5312 :( UPP ? j+1UL : M ) );
5313
5314 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5315 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5316
5317 for( size_t i=ibegin; i<iend; ++i ) {
5318 C(i,j) *= scalar;
5319 }
5320 }
5321 }
5322
5323 if( SYM || HERM ) {
5324 for( size_t j=1UL; j<N; ++j ) {
5325 for( size_t i=0UL; i<j; ++i ) {
5326 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5327 }
5328 }
5329 }
5330 }
5331 //**********************************************************************************************
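   // Note: Stripped of the triangular and symmetry optimizations, the general/general kernel
   // above evaluates C = scalar * (A*B), applying the scalar only once per element after the
   // accumulation. A plain reference formulation (illustrative only):
   //
   //    for( size_t j=0UL; j<N; ++j )
   //       for( size_t i=0UL; i<M; ++i ) {
   //          ElementType sum{};
   //          for( size_t k=0UL; k<K; ++k )
   //             sum += A(i,k) * B(k,j);
   //          C(i,j) = sum * scalar;
   //       }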
5332
5333 //**Default assignment to dense matrices (general/diagonal)*************************************
5347 template< typename MT3 // Type of the left-hand side target matrix
5348 , typename MT4 // Type of the left-hand side matrix operand
5349 , typename MT5 // Type of the right-hand side matrix operand
5350 , typename ST2 > // Type of the scalar value
5351 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5352 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5353 {
5355
5356 const size_t M( A.rows() );
5357 const size_t N( B.columns() );
5358
5359 for( size_t j=0UL; j<N; ++j )
5360 {
5361 const size_t ibegin( ( IsLower_v<MT4> )
5362 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5363 :( 0UL ) );
5364 const size_t iend( ( IsUpper_v<MT4> )
5365 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
5366 :( M ) );
5367 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5368
5369 if( IsLower_v<MT4> ) {
5370 for( size_t i=0UL; i<ibegin; ++i ) {
5371 reset( C(i,j) );
5372 }
5373 }
5374 for( size_t i=ibegin; i<iend; ++i ) {
5375 C(i,j) = A(i,j) * B(j,j) * scalar;
5376 }
5377 if( IsUpper_v<MT4> ) {
5378 for( size_t i=iend; i<M; ++i ) {
5379 reset( C(i,j) );
5380 }
5381 }
5382 }
5383 }
5384 //**********************************************************************************************
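   // Note: With a diagonal right-hand side operand, column j of the product is just column j of
   // A scaled by B(j,j), so the kernel above reduces to C(i,j) = A(i,j) * B(j,j) * scalar within
   // the non-zero band of A and resets the remaining elements of C.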
5385
5386 //**Default assignment to dense matrices (diagonal/general)*************************************
5400 template< typename MT3 // Type of the left-hand side target matrix
5401 , typename MT4 // Type of the left-hand side matrix operand
5402 , typename MT5 // Type of the right-hand side matrix operand
5403 , typename ST2 > // Type of the scalar value
5404 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5405 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5406 {
5408
5409 const size_t M( A.rows() );
5410 const size_t N( B.columns() );
5411
5412 for( size_t j=0UL; j<N; ++j )
5413 {
5414 const size_t ibegin( ( IsLower_v<MT5> )
5415 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5416 :( 0UL ) );
5417 const size_t iend( ( IsUpper_v<MT5> )
5418 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5419 :( M ) );
5420 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5421
5422 if( IsLower_v<MT4> ) {
5423 for( size_t i=0UL; i<ibegin; ++i ) {
5424 reset( C(i,j) );
5425 }
5426 }
5427 for( size_t i=ibegin; i<iend; ++i ) {
5428 C(i,j) = A(i,i) * B(i,j) * scalar;
5429 }
5430 if( IsUpper_v<MT4> ) {
5431 for( size_t i=iend; i<M; ++i ) {
5432 reset( C(i,j) );
5433 }
5434 }
5435 }
5436 }
5437 //**********************************************************************************************
5438
5439 //**Default assignment to dense matrices (diagonal/diagonal)************************************
5453 template< typename MT3 // Type of the left-hand side target matrix
5454 , typename MT4 // Type of the left-hand side matrix operand
5455 , typename MT5 // Type of the right-hand side matrix operand
5456 , typename ST2 > // Type of the scalar value
5457 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5458 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5459 {
5461
5462 reset( C );
5463
5464 for( size_t i=0UL; i<A.rows(); ++i ) {
5465 C(i,i) = A(i,i) * B(i,i) * scalar;
5466 }
5467 }
5468 //**********************************************************************************************
5469
5470 //**Default assignment to dense matrices (small matrices)***************************************
5484 template< typename MT3 // Type of the left-hand side target matrix
5485 , typename MT4 // Type of the left-hand side matrix operand
5486 , typename MT5 // Type of the right-hand side matrix operand
5487 , typename ST2 > // Type of the scalar value
5488 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5489 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5490 {
5491 selectDefaultAssignKernel( C, A, B, scalar );
5492 }
5493 //**********************************************************************************************
5494
5495 //**Vectorized default assignment to row-major dense matrices (small matrices)******************
5510 template< typename MT3 // Type of the left-hand side target matrix
5511 , typename MT4 // Type of the left-hand side matrix operand
5512 , typename MT5 // Type of the right-hand side matrix operand
5513 , typename ST2 > // Type of the scalar value
5514 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5515 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5516 {
5521
5522 const ForwardFunctor fwd;
5523
5524 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
5525 const OppositeType_t<MT5> tmp( serial( B ) );
5526 assign( C, fwd( A * tmp ) * scalar );
5527 }
5528 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
5529 const OppositeType_t<MT4> tmp( serial( A ) );
5530 assign( C, fwd( tmp * B ) * scalar );
5531 }
5532 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5533 const OppositeType_t<MT5> tmp( serial( B ) );
5534 assign( C, fwd( A * tmp ) * scalar );
5535 }
5536 else {
5537 const OppositeType_t<MT4> tmp( serial( A ) );
5538 assign( C, fwd( tmp * B ) * scalar );
5539 }
5540 }
5541 //**********************************************************************************************
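   // Note: For a row-major target the small kernel does not multiply the two column-major
   // operands directly. Instead, the operand that is cheaper to convert (statically sized or
   // smaller) is copied into its opposite, row-major storage order and the mixed-order product
   // is forwarded together with the scalar, e.g. effectively
   //
   //    assign( C, fwd( A * OppositeType_t<MT5>( serial( B ) ) ) * scalar );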
5542
5543 //**Vectorized default assignment to column-major dense matrices (small matrices)***************
5558 template< typename MT3 // Type of the left-hand side target matrix
5559 , typename MT4 // Type of the left-hand side matrix operand
5560 , typename MT5 // Type of the right-hand side matrix operand
5561 , typename ST2 > // Type of the scalar value
5562 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5563 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5564 {
5565 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5566
5567 const size_t M( A.rows() );
5568 const size_t N( B.columns() );
5569 const size_t K( A.columns() );
5570
5571 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5572
5573 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
5574 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
5575
5576 const SIMDType factor( set( scalar ) );
5577
5578 size_t i( 0UL );
5579
5580 if( IsIntegral_v<ElementType> )
5581 {
5582 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5583 for( size_t j=0UL; j<N; ++j )
5584 {
5585 const size_t kbegin( ( IsLower_v<MT5> )
5586 ?( ( IsUpper_v<MT4> )
5587 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5588 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5589 :( IsUpper_v<MT4> ? i : 0UL ) );
5590 const size_t kend( ( IsUpper_v<MT5> )
5591 ?( ( IsLower_v<MT4> )
5592 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5593 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5594 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
5595
5596 size_t k( kbegin );
5597
5598 if( k < kend )
5599 {
5600 SIMDType b1( set( B(k,j) ) );
5601 SIMDType xmm1( A.load(i ,k) * b1 );
5602 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
5603 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
5604 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
5605 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
5606 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,k) * b1 );
5607 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,k) * b1 );
5608 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,k) * b1 );
5609
5610 for( ++k; k<kend; ++k ) {
5611 b1 = set( B(k,j) );
5612 xmm1 += A.load(i ,k) * b1;
5613 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5614 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5615 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5616 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5617 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
5618 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
5619 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
5620 }
5621
5622 C.store( i , j, xmm1 * factor );
5623 C.store( i+SIMDSIZE , j, xmm2 * factor );
5624 C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5625 C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5626 C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5627 C.store( i+SIMDSIZE*5UL, j, xmm6 * factor );
5628 C.store( i+SIMDSIZE*6UL, j, xmm7 * factor );
5629 C.store( i+SIMDSIZE*7UL, j, xmm8 * factor );
5630 }
5631 else
5632 {
5633 const SIMDType zero;
5634 C.store( i , j, zero );
5635 C.store( i+SIMDSIZE , j, zero );
5636 C.store( i+SIMDSIZE*2UL, j, zero );
5637 C.store( i+SIMDSIZE*3UL, j, zero );
5638 C.store( i+SIMDSIZE*4UL, j, zero );
5639 C.store( i+SIMDSIZE*5UL, j, zero );
5640 C.store( i+SIMDSIZE*6UL, j, zero );
5641 C.store( i+SIMDSIZE*7UL, j, zero );
5642 }
5643 }
5644 }
5645 }
5646
5647 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5648 {
5649 size_t j( 0UL );
5650
5651 for( ; (j+2UL) <= N; j+=2UL )
5652 {
5653 const size_t kbegin( ( IsLower_v<MT5> )
5654 ?( ( IsUpper_v<MT4> )
5655 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5656 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5657 :( IsUpper_v<MT4> ? i : 0UL ) );
5658 const size_t kend( ( IsUpper_v<MT5> )
5659 ?( ( IsLower_v<MT4> )
5660 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5661 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5662 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
5663
5664 size_t k( kbegin );
5665
5666 if( k < kend )
5667 {
5668 SIMDType a1( A.load(i ,k) );
5669 SIMDType a2( A.load(i+SIMDSIZE ,k) );
5670 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5671 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5672 SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5673 SIMDType b1( set( B(k,j ) ) );
5674 SIMDType b2( set( B(k,j+1UL) ) );
5675 SIMDType xmm1 ( a1 * b1 );
5676 SIMDType xmm2 ( a2 * b1 );
5677 SIMDType xmm3 ( a3 * b1 );
5678 SIMDType xmm4 ( a4 * b1 );
5679 SIMDType xmm5 ( a5 * b1 );
5680 SIMDType xmm6 ( a1 * b2 );
5681 SIMDType xmm7 ( a2 * b2 );
5682 SIMDType xmm8 ( a3 * b2 );
5683 SIMDType xmm9 ( a4 * b2 );
5684 SIMDType xmm10( a5 * b2 );
5685
5686 for( ++k; k<kend; ++k ) {
5687 a1 = A.load(i ,k);
5688 a2 = A.load(i+SIMDSIZE ,k);
5689 a3 = A.load(i+SIMDSIZE*2UL,k);
5690 a4 = A.load(i+SIMDSIZE*3UL,k);
5691 a5 = A.load(i+SIMDSIZE*4UL,k);
5692 b1 = set( B(k,j ) );
5693 b2 = set( B(k,j+1UL) );
5694 xmm1 += a1 * b1;
5695 xmm2 += a2 * b1;
5696 xmm3 += a3 * b1;
5697 xmm4 += a4 * b1;
5698 xmm5 += a5 * b1;
5699 xmm6 += a1 * b2;
5700 xmm7 += a2 * b2;
5701 xmm8 += a3 * b2;
5702 xmm9 += a4 * b2;
5703 xmm10 += a5 * b2;
5704 }
5705
5706 C.store( i , j , xmm1 * factor );
5707 C.store( i+SIMDSIZE , j , xmm2 * factor );
5708 C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5709 C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5710 C.store( i+SIMDSIZE*4UL, j , xmm5 * factor );
5711 C.store( i , j+1UL, xmm6 * factor );
5712 C.store( i+SIMDSIZE , j+1UL, xmm7 * factor );
5713 C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5714 C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5715 C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5716 }
5717 else
5718 {
5719 const SIMDType zero;
5720 C.store( i , j , zero );
5721 C.store( i+SIMDSIZE , j , zero );
5722 C.store( i+SIMDSIZE*2UL, j , zero );
5723 C.store( i+SIMDSIZE*3UL, j , zero );
5724 C.store( i+SIMDSIZE*4UL, j , zero );
5725 C.store( i , j+1UL, zero );
5726 C.store( i+SIMDSIZE , j+1UL, zero );
5727 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
5728 C.store( i+SIMDSIZE*3UL, j+1UL, zero );
5729 C.store( i+SIMDSIZE*4UL, j+1UL, zero );
5730 }
5731 }
5732
5733 if( j < N )
5734 {
5735 const size_t kbegin( ( IsLower_v<MT5> )
5736 ?( ( IsUpper_v<MT4> )
5737 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5738 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5739 :( IsUpper_v<MT4> ? i : 0UL ) );
5740 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
5741
5742 size_t k( kbegin );
5743
5744 if( k < kend )
5745 {
5746 SIMDType b1( set( B(k,j) ) );
5747 SIMDType xmm1( A.load(i ,k) * b1 );
5748 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
5749 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
5750 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
5751 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
5752
5753 for( ++k; k<kend; ++k ) {
5754 b1 = set( B(k,j) );
5755 xmm1 += A.load(i ,k) * b1;
5756 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5757 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5758 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5759 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5760 }
5761
5762 C.store( i , j, xmm1 * factor );
5763 C.store( i+SIMDSIZE , j, xmm2 * factor );
5764 C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5765 C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5766 C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5767 }
5768 else
5769 {
5770 const SIMDType zero;
5771 C.store( i , j, zero );
5772 C.store( i+SIMDSIZE , j, zero );
5773 C.store( i+SIMDSIZE*2UL, j, zero );
5774 C.store( i+SIMDSIZE*3UL, j, zero );
5775 C.store( i+SIMDSIZE*4UL, j, zero );
5776 }
5777 }
5778 }
5779
5780 for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5781 {
5782 const size_t jend( LOW ? min(i+SIMDSIZE*4UL,N) : N );
5783 size_t j( 0UL );
5784
5785 if( SYM || HERM ) {
5786 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
5787 for( ; j<i; ++j ) {
5788 for( size_t ii=i; ii<iiend; ++ii ) {
5789 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
5790 }
5791 }
5792 }
5793 else if( UPP ) {
5794 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
5795 for( ; j<i; ++j ) {
5796 for( size_t ii=i; ii<iiend; ++ii ) {
5797 reset( C(ii,j) );
5798 }
5799 }
5800 }
5801
5802 for( ; (j+2UL) <= jend; j+=2UL )
5803 {
5804 const size_t kbegin( ( IsLower_v<MT5> )
5805 ?( ( IsUpper_v<MT4> )
5806 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5807 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5808 :( IsUpper_v<MT4> ? i : 0UL ) );
5809 const size_t kend( ( IsUpper_v<MT5> )
5810 ?( ( IsLower_v<MT4> )
5811 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5812 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5813 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
5814
5815 size_t k( kbegin );
5816
5817 if( k < kend )
5818 {
5819 SIMDType a1( A.load(i ,k) );
5820 SIMDType a2( A.load(i+SIMDSIZE ,k) );
5821 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5822 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5823 SIMDType b1( set( B(k,j ) ) );
5824 SIMDType b2( set( B(k,j+1UL) ) );
5825 SIMDType xmm1( a1 * b1 );
5826 SIMDType xmm2( a2 * b1 );
5827 SIMDType xmm3( a3 * b1 );
5828 SIMDType xmm4( a4 * b1 );
5829 SIMDType xmm5( a1 * b2 );
5830 SIMDType xmm6( a2 * b2 );
5831 SIMDType xmm7( a3 * b2 );
5832 SIMDType xmm8( a4 * b2 );
5833
5834 for( ++k; k<kend; ++k ) {
5835 a1 = A.load(i ,k);
5836 a2 = A.load(i+SIMDSIZE ,k);
5837 a3 = A.load(i+SIMDSIZE*2UL,k);
5838 a4 = A.load(i+SIMDSIZE*3UL,k);
5839 b1 = set( B(k,j ) );
5840 b2 = set( B(k,j+1UL) );
5841 xmm1 += a1 * b1;
5842 xmm2 += a2 * b1;
5843 xmm3 += a3 * b1;
5844 xmm4 += a4 * b1;
5845 xmm5 += a1 * b2;
5846 xmm6 += a2 * b2;
5847 xmm7 += a3 * b2;
5848 xmm8 += a4 * b2;
5849 }
5850
5851 C.store( i , j , xmm1 * factor );
5852 C.store( i+SIMDSIZE , j , xmm2 * factor );
5853 C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5854 C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5855 C.store( i , j+1UL, xmm5 * factor );
5856 C.store( i+SIMDSIZE , j+1UL, xmm6 * factor );
5857 C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5858 C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5859 }
5860 else
5861 {
5862 const SIMDType zero;
5863 C.store( i , j , zero );
5864 C.store( i+SIMDSIZE , j , zero );
5865 C.store( i+SIMDSIZE*2UL, j , zero );
5866 C.store( i+SIMDSIZE*3UL, j , zero );
5867 C.store( i , j+1UL, zero );
5868 C.store( i+SIMDSIZE , j+1UL, zero );
5869 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
5870 C.store( i+SIMDSIZE*3UL, j+1UL, zero );
5871 }
5872 }
5873
5874 if( j < jend )
5875 {
5876 const size_t kbegin( ( IsLower_v<MT5> )
5877 ?( ( IsUpper_v<MT4> )
5878 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5879 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5880 :( IsUpper_v<MT4> ? i : 0UL ) );
5881 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5882
5883 size_t k( kbegin );
5884
5885 if( k < kend )
5886 {
5887 SIMDType b1( set( B(k,j) ) );
5888 SIMDType xmm1( A.load(i ,k) * b1 );
5889 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
5890 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
5891 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
5892
5893 for( ++k; k<kend; ++k ) {
5894 b1 = set( B(k,j) );
5895 xmm1 += A.load(i ,k) * b1;
5896 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5897 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5898 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5899 }
5900
5901 C.store( i , j, xmm1 * factor );
5902 C.store( i+SIMDSIZE , j, xmm2 * factor );
5903 C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5904 C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5905 }
5906 else
5907 {
5908 const SIMDType zero;
5909 C.store( i , j, zero );
5910 C.store( i+SIMDSIZE , j, zero );
5911 C.store( i+SIMDSIZE*2UL, j, zero );
5912 C.store( i+SIMDSIZE*3UL, j, zero );
5913 }
5914
5915 if( LOW ) ++j;
5916 }
5917
5918 if( LOW ) {
5919 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
5920 for( ; j<N; ++j ) {
5921 for( size_t ii=i; ii<iiend; ++ii ) {
5922 reset( C(ii,j) );
5923 }
5924 }
5925 }
5926 }
5927
5928 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5929 {
5930 const size_t jend( LOW ? min(i+SIMDSIZE*3UL,N) : N );
5931 size_t j( 0UL );
5932
5933 if( SYM || HERM ) {
5934 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
5935 for( ; j<i; ++j ) {
5936 for( size_t ii=i; ii<iiend; ++ii ) {
5937 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
5938 }
5939 }
5940 }
5941 else if( UPP ) {
5942 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
5943 for( ; j<i; ++j ) {
5944 for( size_t ii=i; ii<iiend; ++ii ) {
5945 reset( C(ii,j) );
5946 }
5947 }
5948 }
5949
5950 for( ; (j+2UL) <= jend; j+=2UL )
5951 {
5952 const size_t kbegin( ( IsLower_v<MT5> )
5953 ?( ( IsUpper_v<MT4> )
5954 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5955 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5956 :( IsUpper_v<MT4> ? i : 0UL ) );
5957 const size_t kend( ( IsUpper_v<MT5> )
5958 ?( ( IsLower_v<MT4> )
5959 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5960 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5961 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
5962
5963 size_t k( kbegin );
5964
5965 if( k < kend )
5966 {
5967 SIMDType a1( A.load(i ,k) );
5968 SIMDType a2( A.load(i+SIMDSIZE ,k) );
5969 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5970 SIMDType b1( set( B(k,j ) ) );
5971 SIMDType b2( set( B(k,j+1UL) ) );
5972 SIMDType xmm1( a1 * b1 );
5973 SIMDType xmm2( a2 * b1 );
5974 SIMDType xmm3( a3 * b1 );
5975 SIMDType xmm4( a1 * b2 );
5976 SIMDType xmm5( a2 * b2 );
5977 SIMDType xmm6( a3 * b2 );
5978
5979 for( ++k; k<kend; ++k ) {
5980 a1 = A.load(i ,k);
5981 a2 = A.load(i+SIMDSIZE ,k);
5982 a3 = A.load(i+SIMDSIZE*2UL,k);
5983 b1 = set( B(k,j ) );
5984 b2 = set( B(k,j+1UL) );
5985 xmm1 += a1 * b1;
5986 xmm2 += a2 * b1;
5987 xmm3 += a3 * b1;
5988 xmm4 += a1 * b2;
5989 xmm5 += a2 * b2;
5990 xmm6 += a3 * b2;
5991 }
5992
5993 C.store( i , j , xmm1 * factor );
5994 C.store( i+SIMDSIZE , j , xmm2 * factor );
5995 C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5996 C.store( i , j+1UL, xmm4 * factor );
5997 C.store( i+SIMDSIZE , j+1UL, xmm5 * factor );
5998 C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
5999 }
6000 else
6001 {
6002 const SIMDType zero;
6003 C.store( i , j , zero );
6004 C.store( i+SIMDSIZE , j , zero );
6005 C.store( i+SIMDSIZE*2UL, j , zero );
6006 C.store( i , j+1UL, zero );
6007 C.store( i+SIMDSIZE , j+1UL, zero );
6008 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
6009 }
6010 }
6011
6012 if( j < jend )
6013 {
6014 const size_t kbegin( ( IsLower_v<MT5> )
6015 ?( ( IsUpper_v<MT4> )
6016 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6017 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6018 :( IsUpper_v<MT4> ? i : 0UL ) );
6019 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6020
6021 size_t k( kbegin );
6022
6023 if( k < kend )
6024 {
6025 SIMDType b1( set( B(k,j) ) );
6026 SIMDType xmm1( A.load(i ,k) * b1 );
6027 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
6028 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
6029
6030 for( ++k; k<kend; ++k ) {
6031 b1 = set( B(k,j) );
6032 xmm1 += A.load(i ,k) * b1;
6033 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6034 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6035 }
6036
6037 C.store( i , j, xmm1 * factor );
6038 C.store( i+SIMDSIZE , j, xmm2 * factor );
6039 C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
6040 }
6041 else
6042 {
6043 const SIMDType zero;
6044 C.store( i , j, zero );
6045 C.store( i+SIMDSIZE , j, zero );
6046 C.store( i+SIMDSIZE*2UL, j, zero );
6047 }
6048
6049 if( LOW ) ++j;
6050 }
6051
6052 if( LOW ) {
6053 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
6054 for( ; j<N; ++j ) {
6055 for( size_t ii=i; ii<iiend; ++ii ) {
6056 reset( C(ii,j) );
6057 }
6058 }
6059 }
6060 }
6061
6062 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6063 {
6064 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6065 size_t j( 0UL );
6066
6067 if( SYM || HERM ) {
6068 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
6069 for( ; j<i; ++j ) {
6070 for( size_t ii=i; ii<iiend; ++ii ) {
6071 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
6072 }
6073 }
6074 }
6075 else if( UPP ) {
6076 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
6077 for( ; j<i; ++j ) {
6078 for( size_t ii=i; ii<iiend; ++ii ) {
6079 reset( C(ii,j) );
6080 }
6081 }
6082 }
6083
6084 for( ; (j+4UL) <= jend; j+=4UL )
6085 {
6086 const size_t kbegin( ( IsLower_v<MT5> )
6087 ?( ( IsUpper_v<MT4> )
6088 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6089 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6090 :( IsUpper_v<MT4> ? i : 0UL ) );
6091 const size_t kend( ( IsUpper_v<MT5> )
6092 ?( ( IsLower_v<MT4> )
6093 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6094 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6095 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6096
6097 size_t k( kbegin );
6098
6099 if( k < kend )
6100 {
6101 SIMDType a1( A.load(i ,k) );
6102 SIMDType a2( A.load(i+SIMDSIZE,k) );
6103 SIMDType b1( set( B(k,j ) ) );
6104 SIMDType b2( set( B(k,j+1UL) ) );
6105 SIMDType b3( set( B(k,j+2UL) ) );
6106 SIMDType b4( set( B(k,j+3UL) ) );
6107 SIMDType xmm1( a1 * b1 );
6108 SIMDType xmm2( a2 * b1 );
6109 SIMDType xmm3( a1 * b2 );
6110 SIMDType xmm4( a2 * b2 );
6111 SIMDType xmm5( a1 * b3 );
6112 SIMDType xmm6( a2 * b3 );
6113 SIMDType xmm7( a1 * b4 );
6114 SIMDType xmm8( a2 * b4 );
6115
6116 for( ++k; k<kend; ++k ) {
6117 a1 = A.load(i ,k);
6118 a2 = A.load(i+SIMDSIZE,k);
6119 b1 = set( B(k,j ) );
6120 b2 = set( B(k,j+1UL) );
6121 b3 = set( B(k,j+2UL) );
6122 b4 = set( B(k,j+3UL) );
6123 xmm1 += a1 * b1;
6124 xmm2 += a2 * b1;
6125 xmm3 += a1 * b2;
6126 xmm4 += a2 * b2;
6127 xmm5 += a1 * b3;
6128 xmm6 += a2 * b3;
6129 xmm7 += a1 * b4;
6130 xmm8 += a2 * b4;
6131 }
6132
6133 C.store( i , j , xmm1 * factor );
6134 C.store( i+SIMDSIZE, j , xmm2 * factor );
6135 C.store( i , j+1UL, xmm3 * factor );
6136 C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
6137 C.store( i , j+2UL, xmm5 * factor );
6138 C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
6139 C.store( i , j+3UL, xmm7 * factor );
6140 C.store( i+SIMDSIZE, j+3UL, xmm8 * factor );
6141 }
6142 else
6143 {
6144 const SIMDType zero;
6145 C.store( i , j , zero );
6146 C.store( i+SIMDSIZE, j , zero );
6147 C.store( i , j+1UL, zero );
6148 C.store( i+SIMDSIZE, j+1UL, zero );
6149 C.store( i , j+2UL, zero );
6150 C.store( i+SIMDSIZE, j+2UL, zero );
6151 C.store( i , j+3UL, zero );
6152 C.store( i+SIMDSIZE, j+3UL, zero );
6153 }
6154 }
6155
6156 for( ; (j+3UL) <= jend; j+=3UL )
6157 {
6158 const size_t kbegin( ( IsLower_v<MT5> )
6159 ?( ( IsUpper_v<MT4> )
6160 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6161 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6162 :( IsUpper_v<MT4> ? i : 0UL ) );
6163 const size_t kend( ( IsUpper_v<MT5> )
6164 ?( ( IsLower_v<MT4> )
6165 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6166 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6167 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6168
6169 size_t k( kbegin );
6170
6171 if( k < kend )
6172 {
6173 SIMDType a1( A.load(i ,k) );
6174 SIMDType a2( A.load(i+SIMDSIZE,k) );
6175 SIMDType b1( set( B(k,j ) ) );
6176 SIMDType b2( set( B(k,j+1UL) ) );
6177 SIMDType b3( set( B(k,j+2UL) ) );
6178 SIMDType xmm1( a1 * b1 );
6179 SIMDType xmm2( a2 * b1 );
6180 SIMDType xmm3( a1 * b2 );
6181 SIMDType xmm4( a2 * b2 );
6182 SIMDType xmm5( a1 * b3 );
6183 SIMDType xmm6( a2 * b3 );
6184
6185 for( ++k; k<kend; ++k ) {
6186 a1 = A.load(i ,k);
6187 a2 = A.load(i+SIMDSIZE,k);
6188 b1 = set( B(k,j ) );
6189 b2 = set( B(k,j+1UL) );
6190 b3 = set( B(k,j+2UL) );
6191 xmm1 += a1 * b1;
6192 xmm2 += a2 * b1;
6193 xmm3 += a1 * b2;
6194 xmm4 += a2 * b2;
6195 xmm5 += a1 * b3;
6196 xmm6 += a2 * b3;
6197 }
6198
6199 C.store( i , j , xmm1 * factor );
6200 C.store( i+SIMDSIZE, j , xmm2 * factor );
6201 C.store( i , j+1UL, xmm3 * factor );
6202 C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
6203 C.store( i , j+2UL, xmm5 * factor );
6204 C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
6205 }
6206 else
6207 {
6208 const SIMDType zero;
6209 C.store( i , j , zero );
6210 C.store( i+SIMDSIZE, j , zero );
6211 C.store( i , j+1UL, zero );
6212 C.store( i+SIMDSIZE, j+1UL, zero );
6213 C.store( i , j+2UL, zero );
6214 C.store( i+SIMDSIZE, j+2UL, zero );
6215 }
6216 }
6217
6218 for( ; (j+2UL) <= jend; j+=2UL )
6219 {
6220 const size_t kbegin( ( IsLower_v<MT5> )
6221 ?( ( IsUpper_v<MT4> )
6222 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6223 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6224 :( IsUpper_v<MT4> ? i : 0UL ) );
6225 const size_t kend( ( IsUpper_v<MT5> )
6226 ?( ( IsLower_v<MT4> )
6227 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6228 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6229 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6230
6231 size_t k( kbegin );
6232
6233 if( k < kend )
6234 {
6235 SIMDType a1( A.load(i ,k) );
6236 SIMDType a2( A.load(i+SIMDSIZE,k) );
6237 SIMDType b1( set( B(k,j ) ) );
6238 SIMDType b2( set( B(k,j+1UL) ) );
6239 SIMDType xmm1( a1 * b1 );
6240 SIMDType xmm2( a2 * b1 );
6241 SIMDType xmm3( a1 * b2 );
6242 SIMDType xmm4( a2 * b2 );
6243
6244 for( ++k; k<kend; ++k ) {
6245 a1 = A.load(i ,k);
6246 a2 = A.load(i+SIMDSIZE,k);
6247 b1 = set( B(k,j ) );
6248 b2 = set( B(k,j+1UL) );
6249 xmm1 += a1 * b1;
6250 xmm2 += a2 * b1;
6251 xmm3 += a1 * b2;
6252 xmm4 += a2 * b2;
6253 }
6254
6255 C.store( i , j , xmm1 * factor );
6256 C.store( i+SIMDSIZE, j , xmm2 * factor );
6257 C.store( i , j+1UL, xmm3 * factor );
6258 C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
6259 }
6260 else
6261 {
6262 const SIMDType zero;
6263 C.store( i , j , zero );
6264 C.store( i+SIMDSIZE, j , zero );
6265 C.store( i , j+1UL, zero );
6266 C.store( i+SIMDSIZE, j+1UL, zero );
6267 }
6268 }
6269
6270 if( j < jend )
6271 {
6272 const size_t kbegin( ( IsLower_v<MT5> )
6273 ?( ( IsUpper_v<MT4> )
6274 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6275 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6276 :( IsUpper_v<MT4> ? i : 0UL ) );
6277 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6278
6279 size_t k( kbegin );
6280
6281 if( k < kend )
6282 {
6283 SIMDType b1( set( B(k,j) ) );
6284 SIMDType xmm1( A.load(i ,k) * b1 );
6285 SIMDType xmm2( A.load(i+SIMDSIZE,k) * b1 );
6286
6287 for( ++k; k<kend; ++k ) {
6288 b1 = set( B(k,j) );
6289 xmm1 += A.load(i ,k) * b1;
6290 xmm2 += A.load(i+SIMDSIZE,k) * b1;
6291 }
6292
6293 C.store( i , j, xmm1 * factor );
6294 C.store( i+SIMDSIZE, j, xmm2 * factor );
6295 }
6296 else
6297 {
6298 const SIMDType zero;
6299 C.store( i , j, zero );
6300 C.store( i+SIMDSIZE, j, zero );
6301 }
6302
6303 if( LOW ) ++j;
6304 }
6305
6306 if( LOW ) {
6307 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
6308 for( ; j<N; ++j ) {
6309 for( size_t ii=i; ii<iiend; ++ii ) {
6310 reset( C(ii,j) );
6311 }
6312 }
6313 }
6314 }
6315
6316 for( ; i<ipos; i+=SIMDSIZE )
6317 {
6318 const size_t jend( LOW ? min(i+SIMDSIZE,N) : N );
6319 size_t j( 0UL );
6320
6321 if( SYM || HERM ) {
6322 const size_t iiend( min(i+SIMDSIZE,M) );
6323 for( ; j<i; ++j ) {
6324 for( size_t ii=i; ii<iiend; ++ii ) {
6325 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
6326 }
6327 }
6328 }
6329 else if( UPP ) {
6330 const size_t iiend( min(i+SIMDSIZE,M) );
6331 for( ; j<i; ++j ) {
6332 for( size_t ii=i; ii<iiend; ++ii ) {
6333 reset( C(ii,j) );
6334 }
6335 }
6336 }
6337
6338 for( ; (j+4UL) <= jend; j+=4UL )
6339 {
6340 const size_t kbegin( ( IsLower_v<MT5> )
6341 ?( ( IsUpper_v<MT4> )
6342 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6343 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6344 :( IsUpper_v<MT4> ? i : 0UL ) );
6345 const size_t kend( ( IsUpper_v<MT5> )
6346 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6347 :( K ) );
6348
6349 size_t k( kbegin );
6350
6351 if( k < kend )
6352 {
6353 SIMDType a1( A.load(i,k) );
6354 SIMDType xmm1( a1 * set( B(k,j ) ) );
6355 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
6356 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
6357 SIMDType xmm4( a1 * set( B(k,j+3UL) ) );
6358
6359 for( ++k; k<kend; ++k ) {
6360 a1 = A.load(i,k);
6361 xmm1 += a1 * set( B(k,j ) );
6362 xmm2 += a1 * set( B(k,j+1UL) );
6363 xmm3 += a1 * set( B(k,j+2UL) );
6364 xmm4 += a1 * set( B(k,j+3UL) );
6365 }
6366
6367 C.store( i, j , xmm1 * factor );
6368 C.store( i, j+1UL, xmm2 * factor );
6369 C.store( i, j+2UL, xmm3 * factor );
6370 C.store( i, j+3UL, xmm4 * factor );
6371 }
6372 else
6373 {
6374 const SIMDType zero;
6375 C.store( i, j , zero );
6376 C.store( i, j+1UL, zero );
6377 C.store( i, j+2UL, zero );
6378 C.store( i, j+3UL, zero );
6379 }
6380 }
6381
6382 for( ; (j+3UL) <= jend; j+=3UL )
6383 {
6384 const size_t kbegin( ( IsLower_v<MT5> )
6385 ?( ( IsUpper_v<MT4> )
6386 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6387 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6388 :( IsUpper_v<MT4> ? i : 0UL ) );
6389 const size_t kend( ( IsUpper_v<MT5> )
6390 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6391 :( K ) );
6392
6393 size_t k( kbegin );
6394
6395 if( k < kend )
6396 {
6397 SIMDType a1( A.load(i,k) );
6398 SIMDType xmm1( a1 * set( B(k,j ) ) );
6399 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
6400 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
6401
6402 for( ++k; k<kend; ++k ) {
6403 a1 = A.load(i,k);
6404 xmm1 += a1 * set( B(k,j ) );
6405 xmm2 += a1 * set( B(k,j+1UL) );
6406 xmm3 += a1 * set( B(k,j+2UL) );
6407 }
6408
6409 C.store( i, j , xmm1 * factor );
6410 C.store( i, j+1UL, xmm2 * factor );
6411 C.store( i, j+2UL, xmm3 * factor );
6412 }
6413 else
6414 {
6415 const SIMDType zero;
6416 C.store( i, j , zero );
6417 C.store( i, j+1UL, zero );
6418 C.store( i, j+2UL, zero );
6419 }
6420 }
6421
6422 for( ; (j+2UL) <= jend; j+=2UL )
6423 {
6424 const size_t kbegin( ( IsLower_v<MT5> )
6425 ?( ( IsUpper_v<MT4> )
6426 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6427 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6428 :( IsUpper_v<MT4> ? i : 0UL ) );
6429 const size_t kend( ( IsUpper_v<MT5> )
6430 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6431 :( K ) );
6432
6433 size_t k( kbegin );
6434
6435 if( k < kend )
6436 {
6437 SIMDType a1( A.load(i,k) );
6438 SIMDType xmm1( a1 * set( B(k,j ) ) );
6439 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
6440
6441 for( ++k; k<kend; ++k ) {
6442 a1 = A.load(i,k);
6443 xmm1 += a1 * set( B(k,j ) );
6444 xmm2 += a1 * set( B(k,j+1UL) );
6445 }
6446
6447 C.store( i, j , xmm1 * factor );
6448 C.store( i, j+1UL, xmm2 * factor );
6449 }
6450 else
6451 {
6452 const SIMDType zero;
6453 C.store( i, j , zero );
6454 C.store( i, j+1UL, zero );
6455 }
6456 }
6457
6458 if( j < jend )
6459 {
6460 const size_t kbegin( ( IsLower_v<MT5> )
6461 ?( ( IsUpper_v<MT4> )
6462 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6463 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6464 :( IsUpper_v<MT4> ? i : 0UL ) );
6465
6466 size_t k( kbegin );
6467
6468 if( k < K )
6469 {
6470 SIMDType xmm1( A.load(i,k) * set( B(k,j) ) );
6471
6472 for( ++k; k<K; ++k ) {
6473 xmm1 += A.load(i,k) * set( B(k,j) );
6474 }
6475
6476 C.store( i, j, xmm1 * factor );
6477 }
6478 else
6479 {
6480 const SIMDType zero;
6481 C.store( i, j, zero );
6482 }
6483
6484 if( LOW ) ++j;
6485 }
6486
6487 if( LOW ) {
6488 const size_t iiend( min(i+SIMDSIZE,M) );
6489 for( ; j<N; ++j ) {
6490 for( size_t ii=i; ii<iiend; ++ii ) {
6491 reset( C(ii,j) );
6492 }
6493 }
6494 }
6495 }
6496
6497 for( ; remainder && i<M; ++i )
6498 {
6499 size_t j( 0UL );
6500
6501 if( SYM || HERM ) {
6502 for( ; j<i; ++j ) {
6503 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
6504 }
6505 }
6506 else if( UPP ) {
6507 for( ; j<i; ++j ) {
6508 reset( C(i,j) );
6509 }
6510 }
6511
6512 for( ; (j+2UL) <= N; j+=2UL )
6513 {
6514 const size_t kbegin( ( IsLower_v<MT5> )
6515 ?( ( IsUpper_v<MT4> )
6516 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6517 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6518 :( IsUpper_v<MT4> ? i : 0UL ) );
6519 const size_t kend( ( IsUpper_v<MT5> )
6520 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6521 :( K ) );
6522
6523 size_t k( kbegin );
6524
6525 if( k < kend )
6526 {
6527 ElementType value1( A(i,k) * B(k,j ) );
6528 ElementType value2( A(i,k) * B(k,j+1UL) );
6529
6530 for( ++k; k<kend; ++k ) {
6531 value1 += A(i,k) * B(k,j );
6532 value2 += A(i,k) * B(k,j+1UL);
6533 }
6534
6535 C(i,j ) = value1 * scalar;
6536 C(i,j+1UL) = value2 * scalar;
6537 }
6538 else
6539 {
6540 reset( C(i,j ) );
6541 reset( C(i,j+1UL) );
6542 }
6543 }
6544
6545 if( j < N )
6546 {
6547 const size_t kbegin( ( IsLower_v<MT5> )
6548 ?( ( IsUpper_v<MT4> )
6549 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6550 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6551 :( IsUpper_v<MT4> ? i : 0UL ) );
6552
6553 size_t k( kbegin );
6554
6555 if( k < K )
6556 {
6557 ElementType value( A(i,k) * B(k,j) );
6558
6559 for( ++k; k<K; ++k ) {
6560 value += A(i,k) * B(k,j);
6561 }
6562
6563 C(i,j) = value * scalar;
6564 }
6565 else
6566 {
6567 reset( C(i,j) );
6568 }
6569 }
6570 }
6571 }
6572 //**********************************************************************************************
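   //**Usage note (sketch)*************************************************************************
   // The kernel above evaluates the scaled product of two column-major dense matrices by
   // traversing the target in panels of SIMDSIZE*{8,5,4,3,2,1} rows, broadcasting single
   // elements of B via set(), and mirroring (or resetting) the skipped triangle for
   // symmetric, Hermitian, lower, or upper results. A minimal usage sketch of the kind of
   // expression these kernels evaluate; which concrete kernel actually runs depends on the
   // configured thresholds and BLAS mode, and sizes, values, and the scalar are
   // illustrative only:
   //
   //    #include <blaze/Math.h>
   //
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 64UL, 64UL, 1.0 );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> B( 64UL, 64UL, 2.0 );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> C;
   //
   //    C = 3.0 * ( A * B );  // scaled column-major matrix/matrix multiplication
   //**********************************************************************************************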
6573
6574 //**Default assignment to dense matrices (large matrices)***************************************
6588 template< typename MT3 // Type of the left-hand side target matrix
6589 , typename MT4 // Type of the left-hand side matrix operand
6590 , typename MT5 // Type of the right-hand side matrix operand
6591 , typename ST2 > // Type of the scalar value
6592 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6593 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6594 {
6595 selectDefaultAssignKernel( C, A, B, scalar );
6596 }
6597 //**********************************************************************************************
6598
6599 //**Vectorized default assignment to dense matrices (large matrices)****************************
6614 template< typename MT3 // Type of the left-hand side target matrix
6615 , typename MT4 // Type of the left-hand side matrix operand
6616 , typename MT5 // Type of the right-hand side matrix operand
6617 , typename ST2 > // Type of the scalar value
6618 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6619 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6620 {
6621 if( SYM )
6622 smmm( C, A, B, scalar );
6623 else if( HERM )
6624 hmmm( C, A, B, scalar );
6625 else if( LOW )
6626 lmmm( C, A, B, scalar, ST2(0) );
6627 else if( UPP )
6628 ummm( C, A, B, scalar, ST2(0) );
6629 else
6630 mmm( C, A, B, scalar, ST2(0) );
6631 }
6632 //**********************************************************************************************
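   //**Kernel note (sketch)************************************************************************
   // For large operands the evaluation is delegated to the blocked kernels smmm(), hmmm(),
   // lmmm(), ummm(), and mmm(), which tile the operands for the cache hierarchy. Judging by
   // the arguments passed here, their trailing parameters act like a GEMM-style update
   // (sketch of the semantics as used in this file, not a formal specification):
   //
   //    C = scalar * ( A * B ) + beta * C    // beta == ST2(0): previous contents discarded
   //
   // The addition assignment variant further below passes beta == ST2(1) in order to
   // accumulate into C instead.
   //**********************************************************************************************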
6633
6634 //**BLAS-based assignment to dense matrices (default)*******************************************
6648 template< typename MT3 // Type of the left-hand side target matrix
6649 , typename MT4 // Type of the left-hand side matrix operand
6650 , typename MT5 // Type of the right-hand side matrix operand
6651 , typename ST2 > // Type of the scalar value
6652 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6653 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6654 {
6655 selectLargeAssignKernel( C, A, B, scalar );
6656 }
6657 //**********************************************************************************************
6658
6659 //**BLAS-based assignment to dense matrices*****************************************************
6660#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6674 template< typename MT3 // Type of the left-hand side target matrix
6675 , typename MT4 // Type of the left-hand side matrix operand
6676 , typename MT5 // Type of the right-hand side matrix operand
6677 , typename ST2 > // Type of the scalar value
6678 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6679 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6680 {
6681 using ET = ElementType_t<MT3>;
6682
6683 if( IsTriangular_v<MT4> ) {
6684 assign( C, B );
6685 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6686 }
6687 else if( IsTriangular_v<MT5> ) {
6688 assign( C, A );
6689 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6690 }
6691 else {
6692 gemm( C, A, B, ET(scalar), ET(0) );
6693 }
6694 }
6695#endif
6696 //**********************************************************************************************
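   //**BLAS note (sketch)**************************************************************************
   // The BLAS path maps the scaled product onto the trmm()/gemm() wrappers. Conceptually,
   // for double-precision column-major operands this corresponds to raw CBLAS calls of the
   // following form (a sketch only; CblasLower is shown for a lower-triangular A, and m, n,
   // k, lda, ldb, ldc denote the usual dimensions and strides):
   //
   //    // triangular A:  C <- B, then C <- scalar * A * C
   //    cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit,
   //                 m, n, scalar, A, lda, C, ldc );
   //
   //    // general case:  C <- scalar * A * B + 0 * C
   //    cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
   //                 m, n, k, scalar, A, lda, B, ldb, 0.0, C, ldc );
   //**********************************************************************************************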
6697
6698 //**Assignment to sparse matrices***************************************************************
6710 template< typename MT // Type of the target sparse matrix
6711 , bool SO > // Storage order of the target sparse matrix
6712 friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6713 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6714 {
6716
6717 using TmpType = If_t< SO, ResultType, OppositeType >;
6718
6725
6726 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6727 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6728
6729 const ForwardFunctor fwd;
6730
6731 const TmpType tmp( serial( rhs ) );
6732 assign( *lhs, fwd( tmp ) );
6733 }
6734 //**********************************************************************************************
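   //**Usage note (sketch)*************************************************************************
   // For a sparse target the scaled product is first evaluated into a dense temporary of
   // matching storage order and the temporary is then assigned to the sparse matrix. A
   // minimal sketch (types and sizes illustrative only):
   //
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 64UL, 64UL, 1.0 );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> B( 64UL, 64UL, 2.0 );
   //    blaze::CompressedMatrix<double,blaze::rowMajor>  S;
   //
   //    S = 3.0 * ( A * B );  // evaluated via a dense temporary, then assigned to S
   //**********************************************************************************************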
6735
6736 //**Restructuring assignment to row-major matrices**********************************************
6750 template< typename MT > // Type of the target matrix
6751 friend inline auto assign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
6752 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6753 {
6755
6757
6758 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6759 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6760
6761 const ForwardFunctor fwd;
6762
6763 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
6764 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
6765
6766 assign( *lhs, fwd( A * B ) * rhs.scalar_ );
6767 }
6768 //**********************************************************************************************
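   //**Usage note (sketch)*************************************************************************
   // If either operand is symmetric, the restructuring assignment above transposes it via
   // transIf() at no cost, so that a row-major target can be served by kernels better
   // suited to its storage order instead of evaluating the column-major product into a
   // temporary. A minimal sketch (types and sizes illustrative only):
   //
   //    blaze::SymmetricMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > S( 64UL );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> B( 64UL, 64UL, 2.0 );
   //    blaze::DynamicMatrix<double,blaze::rowMajor>    D;
   //
   //    D = 3.0 * ( S * B );  // S can be read as if it were row-major
   //**********************************************************************************************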
6769
6770 //**Addition assignment to dense matrices*******************************************************
6782 template< typename MT // Type of the target dense matrix
6783 , bool SO > // Storage order of the target dense matrix
6784 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6785 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6786 {
6788
6789 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6790 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6791
6792 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6793 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6794
6795 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
6796 return;
6797 }
6798
6799 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6800 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6801
6802 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6803 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6804 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6805 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6806 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
6807 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
6808
6809 DMatScalarMultExpr::selectAddAssignKernel( *lhs, A, B, rhs.scalar_ );
6810 }
6811 //**********************************************************************************************
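   //**Usage note (sketch)*************************************************************************
   // Addition assignment accumulates the scaled product into an existing target. A minimal
   // sketch (sizes and values illustrative only):
   //
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 64UL, 64UL, 1.0 );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> B( 64UL, 64UL, 2.0 );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> C( 64UL, 64UL, 0.0 );
   //
   //    C += 3.0 * ( A * B );  // dispatches to the addition assignment kernels below
   //**********************************************************************************************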
6812
6813 //**Addition assignment to dense matrices (kernel selection)************************************
6824 template< typename MT3 // Type of the left-hand side target matrix
6825 , typename MT4 // Type of the left-hand side matrix operand
6826 , typename MT5 // Type of the right-hand side matrix operand
6827 , typename ST2 > // Type of the scalar value
6828 static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6829 {
6830 if( ( IsDiagonal_v<MT4> ) ||
6831 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
6832 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6833 selectSmallAddAssignKernel( C, A, B, scalar );
6834 else
6835 selectBlasAddAssignKernel( C, A, B, scalar );
6836 }
6837 //**********************************************************************************************
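   //**Kernel selection note***********************************************************************
   // The heuristic above prefers the small kernel whenever the left operand is diagonal,
   // whenever A has at most SIMDSIZE*10 rows (this shortcut is only taken in release
   // builds), or whenever the target holds fewer than TDMATTDMATMULT_THRESHOLD elements.
   // All remaining cases go to the BLAS kernel, which itself falls back to the large
   // blocked kernel if no suitable BLAS routine is available.
   //**********************************************************************************************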
6838
6839 //**Default addition assignment to dense matrices (general/general)*****************************
6853 template< typename MT3 // Type of the left-hand side target matrix
6854 , typename MT4 // Type of the left-hand side matrix operand
6855 , typename MT5 // Type of the right-hand side matrix operand
6856 , typename ST2 > // Type of the scalar value
6857 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6858 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6859 {
6860 const ResultType tmp( serial( A * B * scalar ) );
6861 addAssign( C, tmp );
6862 }
6863 //**********************************************************************************************
6864
6865 //**Default addition assignment to dense matrices (general/diagonal)****************************
6879 template< typename MT3 // Type of the left-hand side target matrix
6880 , typename MT4 // Type of the left-hand side matrix operand
6881 , typename MT5 // Type of the right-hand side matrix operand
6882 , typename ST2 > // Type of the scalar value
6883 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6884 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6885 {
6887
6888 const size_t M( A.rows() );
6889 const size_t N( B.columns() );
6890
6891 for( size_t j=0UL; j<N; ++j )
6892 {
6893 const size_t ibegin( ( IsLower_v<MT4> )
6894 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
6895 :( 0UL ) );
6896 const size_t iend( ( IsUpper_v<MT4> )
6897 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
6898 :( M ) );
6899 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6900
6901 const size_t inum( iend - ibegin );
6902 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
6903 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
6904
6905 for( size_t i=ibegin; i<ipos; i+=2UL ) {
6906 C(i ,j) += A(i ,j) * B(j,j) * scalar;
6907 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
6908 }
6909 if( ipos < iend ) {
6910 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
6911 }
6912 }
6913 }
6914 //**********************************************************************************************
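   //**Kernel note*********************************************************************************
   // With a diagonal right-hand side operand the product collapses column-wise, since
   // ( A * B )(i,j) == A(i,j) * B(j,j). The kernel above therefore performs the update
   //
   //    C(i,j) += A(i,j) * B(j,j) * scalar
   //
   // for every i inside the non-zero band of column j, with the i-loop unrolled by two.
   //**********************************************************************************************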
6915
6916 //**Default addition assignment to dense matrices (diagonal/general)****************************
6930 template< typename MT3 // Type of the left-hand side target matrix
6931 , typename MT4 // Type of the left-hand side matrix operand
6932 , typename MT5 // Type of the right-hand side matrix operand
6933 , typename ST2 > // Type of the scalar value
6934 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6935 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6936 {
6938
6939 const size_t M( A.rows() );
6940 const size_t N( B.columns() );
6941
6942 for( size_t j=0UL; j<N; ++j )
6943 {
6944 const size_t ibegin( ( IsLower_v<MT5> )
6945 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6946 :( 0UL ) );
6947 const size_t iend( ( IsUpper_v<MT5> )
6948 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
6949 :( M ) );
6950 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6951
6952 const size_t inum( iend - ibegin );
6953 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
6954 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
6955
6956 for( size_t i=ibegin; i<ipos; i+=2UL ) {
6957 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6958 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6959 }
6960 if( ipos < iend ) {
6961 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6962 }
6963 }
6964 }
6965 //**********************************************************************************************
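   //**Kernel note*********************************************************************************
   // With a diagonal left-hand side operand each product element reduces to a single term,
   // ( A * B )(i,j) == A(i,i) * B(i,j), so the kernel above performs the update
   //
   //    C(i,j) += A(i,i) * B(i,j) * scalar
   //
   // restricted to the rows in which column j of B can hold non-zero elements.
   //**********************************************************************************************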
6966
6967 //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
6981 template< typename MT3 // Type of the left-hand side target matrix
6982 , typename MT4 // Type of the left-hand side matrix operand
6983 , typename MT5 // Type of the right-hand side matrix operand
6984 , typename ST2 > // Type of the scalar value
6985 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6986 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6987 {
6989
6990 for( size_t i=0UL; i<A.rows(); ++i ) {
6991 C(i,i) += A(i,i) * B(i,i) * scalar;
6992 }
6993 }
6994 //**********************************************************************************************
6995
6996 //**Default addition assignment to dense matrices (small matrices)******************************
7010 template< typename MT3 // Type of the left-hand side target matrix
7011 , typename MT4 // Type of the left-hand side matrix operand
7012 , typename MT5 // Type of the right-hand side matrix operand
7013 , typename ST2 > // Type of the scalar value
7014 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7015 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7016 {
7017 selectDefaultAddAssignKernel( C, A, B, scalar );
7018 }
7019 //**********************************************************************************************
7020
7021 //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
7036 template< typename MT3 // Type of the left-hand side target matrix
7037 , typename MT4 // Type of the left-hand side matrix operand
7038 , typename MT5 // Type of the right-hand side matrix operand
7039 , typename ST2 > // Type of the scalar value
7040 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7041 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7042 {
7047
7048 const ForwardFunctor fwd;
7049
7050 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7051 const OppositeType_t<MT5> tmp( serial( B ) );
7052 addAssign( C, fwd( A * tmp ) * scalar );
7053 }
7054 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7055 const OppositeType_t<MT4> tmp( serial( A ) );
7056 addAssign( C, fwd( tmp * B ) * scalar );
7057 }
7058 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7059 const OppositeType_t<MT5> tmp( serial( B ) );
7060 addAssign( C, fwd( A * tmp ) * scalar );
7061 }
7062 else {
7063 const OppositeType_t<MT4> tmp( serial( A ) );
7064 addAssign( C, fwd( tmp * B ) * scalar );
7065 }
7066 }
7067 //**********************************************************************************************
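   //**Kernel note*********************************************************************************
   // For a row-major target the kernel above does not multiply the two column-major
   // operands directly. Instead it copies one of them (preferring the non-resizable or
   // smaller operand) into a matrix of the opposite, row-major storage order and
   // re-dispatches, so that the mixed storage order multiplication kernels suited to the
   // row-major target can be used.
   //**********************************************************************************************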
7068
7069 //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
7084 template< typename MT3 // Type of the left-hand side target matrix
7085 , typename MT4 // Type of the left-hand side matrix operand
7086 , typename MT5 // Type of the right-hand side matrix operand
7087 , typename ST2 > // Type of the scalar value
7088 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7089 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7090 {
7091 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
7092
7093 const size_t M( A.rows() );
7094 const size_t N( B.columns() );
7095 const size_t K( A.columns() );
7096
7097 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7098
7099 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
7100 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
7101
7102 const SIMDType factor( set( scalar ) );
7103
7104 size_t i( 0UL );
7105
7106 if( IsIntegral_v<ElementType> )
7107 {
7108 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
7109 for( size_t j=0UL; j<N; ++j )
7110 {
7111 const size_t kbegin( ( IsLower_v<MT5> )
7112 ?( ( IsUpper_v<MT4> )
7113 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7114 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7115 :( IsUpper_v<MT4> ? i : 0UL ) );
7116 const size_t kend( ( IsUpper_v<MT5> )
7117 ?( ( IsLower_v<MT4> )
7118 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7119 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
7120 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
7121
7122 size_t k( kbegin );
7123
7124 if( k < kend )
7125 {
7126 SIMDType b1( set( B(k,j) ) );
7127 SIMDType xmm1( A.load(i ,k) * b1 );
7128 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
7129 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
7130 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
7131 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
7132 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,k) * b1 );
7133 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,k) * b1 );
7134 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,k) * b1 );
7135
7136 for( ++k; k<kend; ++k ) {
7137 b1 = set( B(k,j) );
7138 xmm1 += A.load(i ,k) * b1;
7139 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7140 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7141 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7142 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7143 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
7144 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
7145 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
7146 }
7147
7148 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7149 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
7150 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
7151 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
7152 C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
7153 C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
7154 C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
7155 C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
7156 }
7157 }
7158 }
7159 }
7160
7161 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
7162 {
7163 size_t j( 0UL );
7164
7165 for( ; (j+2UL) <= N; j+=2UL )
7166 {
7167 const size_t kbegin( ( IsLower_v<MT5> )
7168 ?( ( IsUpper_v<MT4> )
7169 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7170 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7171 :( IsUpper_v<MT4> ? i : 0UL ) );
7172 const size_t kend( ( IsUpper_v<MT5> )
7173 ?( ( IsLower_v<MT4> )
7174 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7175 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7176 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
7177
7178 size_t k( kbegin );
7179
7180 if( k < kend )
7181 {
7182 SIMDType a1( A.load(i ,k) );
7183 SIMDType a2( A.load(i+SIMDSIZE ,k) );
7184 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7185 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7186 SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
7187 SIMDType b1( set( B(k,j ) ) );
7188 SIMDType b2( set( B(k,j+1UL) ) );
7189 SIMDType xmm1 ( a1 * b1 );
7190 SIMDType xmm2 ( a2 * b1 );
7191 SIMDType xmm3 ( a3 * b1 );
7192 SIMDType xmm4 ( a4 * b1 );
7193 SIMDType xmm5 ( a5 * b1 );
7194 SIMDType xmm6 ( a1 * b2 );
7195 SIMDType xmm7 ( a2 * b2 );
7196 SIMDType xmm8 ( a3 * b2 );
7197 SIMDType xmm9 ( a4 * b2 );
7198 SIMDType xmm10( a5 * b2 );
7199
7200 for( ++k; k<kend; ++k ) {
7201 a1 = A.load(i ,k);
7202 a2 = A.load(i+SIMDSIZE ,k);
7203 a3 = A.load(i+SIMDSIZE*2UL,k);
7204 a4 = A.load(i+SIMDSIZE*3UL,k);
7205 a5 = A.load(i+SIMDSIZE*4UL,k);
7206 b1 = set( B(k,j ) );
7207 b2 = set( B(k,j+1UL) );
7208 xmm1 += a1 * b1;
7209 xmm2 += a2 * b1;
7210 xmm3 += a3 * b1;
7211 xmm4 += a4 * b1;
7212 xmm5 += a5 * b1;
7213 xmm6 += a1 * b2;
7214 xmm7 += a2 * b2;
7215 xmm8 += a3 * b2;
7216 xmm9 += a4 * b2;
7217 xmm10 += a5 * b2;
7218 }
7219
7220 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7221 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
7222 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
7223 C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
7224 C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
7225 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
7226 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
7227 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
7228 C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
7229 C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
7230 }
7231 }
7232
7233 if( j < N )
7234 {
7235 const size_t kbegin( ( IsLower_v<MT5> )
7236 ?( ( IsUpper_v<MT4> )
7237 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7238 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7239 :( IsUpper_v<MT4> ? i : 0UL ) );
7240 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
7241
7242 size_t k( kbegin );
7243
7244 if( k < kend )
7245 {
7246 SIMDType b1( set( B(k,j) ) );
7247 SIMDType xmm1( A.load(i ,k) * b1 );
7248 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
7249 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
7250 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
7251 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
7252
7253 for( ++k; k<kend; ++k ) {
7254 b1 = set( B(k,j) );
7255 xmm1 += A.load(i ,k) * b1;
7256 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7257 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7258 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7259 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7260 }
7261
7262 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7263 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
7264 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
7265 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
7266 C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
7267 }
7268 }
7269 }
7270
7271 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7272 {
7273 size_t j( 0UL );
7274
7275 for( ; (j+2UL) <= N; j+=2UL )
7276 {
7277 const size_t kbegin( ( IsLower_v<MT5> )
7278 ?( ( IsUpper_v<MT4> )
7279 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7280 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7281 :( IsUpper_v<MT4> ? i : 0UL ) );
7282 const size_t kend( ( IsUpper_v<MT5> )
7283 ?( ( IsLower_v<MT4> )
7284 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7285 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7286 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
7287
7288 size_t k( kbegin );
7289
7290 if( k < kend )
7291 {
7292 SIMDType a1( A.load(i ,k) );
7293 SIMDType a2( A.load(i+SIMDSIZE ,k) );
7294 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7295 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7296 SIMDType b1( set( B(k,j ) ) );
7297 SIMDType b2( set( B(k,j+1UL) ) );
7298 SIMDType xmm1( a1 * b1 );
7299 SIMDType xmm2( a2 * b1 );
7300 SIMDType xmm3( a3 * b1 );
7301 SIMDType xmm4( a4 * b1 );
7302 SIMDType xmm5( a1 * b2 );
7303 SIMDType xmm6( a2 * b2 );
7304 SIMDType xmm7( a3 * b2 );
7305 SIMDType xmm8( a4 * b2 );
7306
7307 for( ++k; k<kend; ++k ) {
7308 a1 = A.load(i ,k);
7309 a2 = A.load(i+SIMDSIZE ,k);
7310 a3 = A.load(i+SIMDSIZE*2UL,k);
7311 a4 = A.load(i+SIMDSIZE*3UL,k);
7312 b1 = set( B(k,j ) );
7313 b2 = set( B(k,j+1UL) );
7314 xmm1 += a1 * b1;
7315 xmm2 += a2 * b1;
7316 xmm3 += a3 * b1;
7317 xmm4 += a4 * b1;
7318 xmm5 += a1 * b2;
7319 xmm6 += a2 * b2;
7320 xmm7 += a3 * b2;
7321 xmm8 += a4 * b2;
7322 }
7323
7324 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7325 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
7326 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
7327 C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
7328 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
7329 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
7330 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
7331 C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
7332 }
7333 }
7334
7335 if( j < N )
7336 {
7337 const size_t kbegin( ( IsLower_v<MT5> )
7338 ?( ( IsUpper_v<MT4> )
7339 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7340 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7341 :( IsUpper_v<MT4> ? i : 0UL ) );
7342 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
7343
7344 size_t k( kbegin );
7345
7346 if( k < kend )
7347 {
7348 SIMDType b1( set( B(k,j) ) );
7349 SIMDType xmm1( A.load(i ,k) * b1 );
7350 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
7351 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
7352 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
7353
7354 for( ++k; k<kend; ++k ) {
7355 b1 = set( B(k,j) );
7356 xmm1 += A.load(i ,k) * b1;
7357 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7358 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7359 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7360 }
7361
7362 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7363 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
7364 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
7365 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
7366 }
7367 }
7368 }
7369
7370 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
7371 {
7372 size_t j( 0UL );
7373
7374 for( ; (j+2UL) <= N; j+=2UL )
7375 {
7376 const size_t kbegin( ( IsLower_v<MT5> )
7377 ?( ( IsUpper_v<MT4> )
7378 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7379 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7380 :( IsUpper_v<MT4> ? i : 0UL ) );
7381 const size_t kend( ( IsUpper_v<MT5> )
7382 ?( ( IsLower_v<MT4> )
7383 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7384 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7385 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
7386
7387 size_t k( kbegin );
7388
7389 if( k < kend )
7390 {
7391 SIMDType a1( A.load(i ,k) );
7392 SIMDType a2( A.load(i+SIMDSIZE ,k) );
7393 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7394 SIMDType b1( set( B(k,j ) ) );
7395 SIMDType b2( set( B(k,j+1UL) ) );
7396 SIMDType xmm1( a1 * b1 );
7397 SIMDType xmm2( a2 * b1 );
7398 SIMDType xmm3( a3 * b1 );
7399 SIMDType xmm4( a1 * b2 );
7400 SIMDType xmm5( a2 * b2 );
7401 SIMDType xmm6( a3 * b2 );
7402
7403 for( ++k; k<kend; ++k ) {
7404 a1 = A.load(i ,k);
7405 a2 = A.load(i+SIMDSIZE ,k);
7406 a3 = A.load(i+SIMDSIZE*2UL,k);
7407 b1 = set( B(k,j ) );
7408 b2 = set( B(k,j+1UL) );
7409 xmm1 += a1 * b1;
7410 xmm2 += a2 * b1;
7411 xmm3 += a3 * b1;
7412 xmm4 += a1 * b2;
7413 xmm5 += a2 * b2;
7414 xmm6 += a3 * b2;
7415 }
7416
7417 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7418 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
7419 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
7420 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
7421 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
7422 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
7423 }
7424 }
7425
7426 if( j < N )
7427 {
7428 const size_t kbegin( ( IsLower_v<MT5> )
7429 ?( ( IsUpper_v<MT4> )
7430 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7431 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7432 :( IsUpper_v<MT4> ? i : 0UL ) );
7433 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
7434
7435 size_t k( kbegin );
7436
7437 if( k < kend )
7438 {
7439 SIMDType b1( set( B(k,j) ) );
7440 SIMDType xmm1( A.load(i ,k) * b1 );
7441 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
7442 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
7443
7444 for( ++k; k<kend; ++k ) {
7445 b1 = set( B(k,j) );
7446 xmm1 += A.load(i ,k) * b1;
7447 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7448 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7449 }
7450
7451 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7452 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
7453 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
7454 }
7455 }
7456 }
7457
7458 for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7459 {
7460 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
7461 size_t j( UPP ? i : 0UL );
7462
7463 for( ; (j+4UL) <= jend; j+=4UL )
7464 {
7465 const size_t kbegin( ( IsLower_v<MT5> )
7466 ?( ( IsUpper_v<MT4> )
7467 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7468 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7469 :( IsUpper_v<MT4> ? i : 0UL ) );
7470 const size_t kend( ( IsUpper_v<MT5> )
7471 ?( ( IsLower_v<MT4> )
7472 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
7473 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
7474 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
7475
7476 size_t k( kbegin );
7477
7478 if( k < kend )
7479 {
7480 SIMDType a1( A.load(i ,k) );
7481 SIMDType a2( A.load(i+SIMDSIZE,k) );
7482 SIMDType b1( set( B(k,j ) ) );
7483 SIMDType b2( set( B(k,j+1UL) ) );
7484 SIMDType b3( set( B(k,j+2UL) ) );
7485 SIMDType b4( set( B(k,j+3UL) ) );
7486 SIMDType xmm1( a1 * b1 );
7487 SIMDType xmm2( a2 * b1 );
7488 SIMDType xmm3( a1 * b2 );
7489 SIMDType xmm4( a2 * b2 );
7490 SIMDType xmm5( a1 * b3 );
7491 SIMDType xmm6( a2 * b3 );
7492 SIMDType xmm7( a1 * b4 );
7493 SIMDType xmm8( a2 * b4 );
7494
7495 for( ++k; k<kend; ++k ) {
7496 a1 = A.load(i ,k);
7497 a2 = A.load(i+SIMDSIZE,k);
7498 b1 = set( B(k,j ) );
7499 b2 = set( B(k,j+1UL) );
7500 b3 = set( B(k,j+2UL) );
7501 b4 = set( B(k,j+3UL) );
7502 xmm1 += a1 * b1;
7503 xmm2 += a2 * b1;
7504 xmm3 += a1 * b2;
7505 xmm4 += a2 * b2;
7506 xmm5 += a1 * b3;
7507 xmm6 += a2 * b3;
7508 xmm7 += a1 * b4;
7509 xmm8 += a2 * b4;
7510 }
7511
7512 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7513 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
7514 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7515 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
7516 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
7517 C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
7518 C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
7519 C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
7520 }
7521 }
7522
7523 for( ; (j+3UL) <= jend; j+=3UL )
7524 {
7525 const size_t kbegin( ( IsLower_v<MT5> )
7526 ?( ( IsUpper_v<MT4> )
7527 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7528 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7529 :( IsUpper_v<MT4> ? i : 0UL ) );
7530 const size_t kend( ( IsUpper_v<MT5> )
7531 ?( ( IsLower_v<MT4> )
7532 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
7533 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
7534 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
7535
7536 size_t k( kbegin );
7537
7538 if( k < kend )
7539 {
7540 SIMDType a1( A.load(i ,k) );
7541 SIMDType a2( A.load(i+SIMDSIZE,k) );
7542 SIMDType b1( set( B(k,j ) ) );
7543 SIMDType b2( set( B(k,j+1UL) ) );
7544 SIMDType b3( set( B(k,j+2UL) ) );
7545 SIMDType xmm1( a1 * b1 );
7546 SIMDType xmm2( a2 * b1 );
7547 SIMDType xmm3( a1 * b2 );
7548 SIMDType xmm4( a2 * b2 );
7549 SIMDType xmm5( a1 * b3 );
7550 SIMDType xmm6( a2 * b3 );
7551
7552 for( ++k; k<kend; ++k ) {
7553 a1 = A.load(i ,k);
7554 a2 = A.load(i+SIMDSIZE,k);
7555 b1 = set( B(k,j ) );
7556 b2 = set( B(k,j+1UL) );
7557 b3 = set( B(k,j+2UL) );
7558 xmm1 += a1 * b1;
7559 xmm2 += a2 * b1;
7560 xmm3 += a1 * b2;
7561 xmm4 += a2 * b2;
7562 xmm5 += a1 * b3;
7563 xmm6 += a2 * b3;
7564 }
7565
7566 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7567 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
7568 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7569 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
7570 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
7571 C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
7572 }
7573 }
7574
7575 for( ; (j+2UL) <= jend; j+=2UL )
7576 {
7577 const size_t kbegin( ( IsLower_v<MT5> )
7578 ?( ( IsUpper_v<MT4> )
7579 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7580 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7581 :( IsUpper_v<MT4> ? i : 0UL ) );
7582 const size_t kend( ( IsUpper_v<MT5> )
7583 ?( ( IsLower_v<MT4> )
7584 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7585 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7586 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
7587
7588 size_t k( kbegin );
7589
7590 if( k < kend )
7591 {
7592 SIMDType a1( A.load(i ,k) );
7593 SIMDType a2( A.load(i+SIMDSIZE,k) );
7594 SIMDType b1( set( B(k,j ) ) );
7595 SIMDType b2( set( B(k,j+1UL) ) );
7596 SIMDType xmm1( a1 * b1 );
7597 SIMDType xmm2( a2 * b1 );
7598 SIMDType xmm3( a1 * b2 );
7599 SIMDType xmm4( a2 * b2 );
7600
7601 for( ++k; k<kend; ++k ) {
7602 a1 = A.load(i ,k);
7603 a2 = A.load(i+SIMDSIZE,k);
7604 b1 = set( B(k,j ) );
7605 b2 = set( B(k,j+1UL) );
7606 xmm1 += a1 * b1;
7607 xmm2 += a2 * b1;
7608 xmm3 += a1 * b2;
7609 xmm4 += a2 * b2;
7610 }
7611
7612 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7613 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
7614 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7615 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
7616 }
7617 }
7618
7619 if( j < jend )
7620 {
7621 const size_t kbegin( ( IsLower_v<MT5> )
7622 ?( ( IsUpper_v<MT4> )
7623 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7624 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7625 :( IsUpper_v<MT4> ? i : 0UL ) );
7626 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
7627
7628 size_t k( kbegin );
7629
7630 if( k < kend )
7631 {
7632 SIMDType b1( set( B(k,j) ) );
7633 SIMDType xmm1( A.load(i ,k) * b1 );
7634 SIMDType xmm2( A.load(i+SIMDSIZE,k) * b1 );
7635
7636 for( ++k; k<kend; ++k ) {
7637 b1 = set( B(k,j) );
7638 xmm1 += A.load(i ,k) * b1;
7639 xmm2 += A.load(i+SIMDSIZE,k) * b1;
7640 }
7641
7642 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7643 C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) + xmm2 * factor );
7644 }
7645 }
7646 }
7647
7648 for( ; i<ipos; i+=SIMDSIZE )
7649 {
7650 const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
7651 size_t j( UPP ? i : 0UL );
7652
7653 for( ; (j+4UL) <= jend; j+=4UL )
7654 {
7655 const size_t kbegin( ( IsLower_v<MT5> )
7656 ?( ( IsUpper_v<MT4> )
7657 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7658 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7659 :( IsUpper_v<MT4> ? i : 0UL ) );
7660 const size_t kend( ( IsUpper_v<MT5> )
7661 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
7662 :( K ) );
7663
7664 size_t k( kbegin );
7665
7666 if( k < kend )
7667 {
7668 SIMDType a1( A.load(i,k) );
7669 SIMDType xmm1( a1 * set( B(k,j ) ) );
7670 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
7671 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
7672 SIMDType xmm4( a1 * set( B(k,j+3UL) ) );
7673
7674 for( ++k; k<kend; ++k ) {
7675 a1 = A.load(i,k);
7676 xmm1 += a1 * set( B(k,j ) );
7677 xmm2 += a1 * set( B(k,j+1UL) );
7678 xmm3 += a1 * set( B(k,j+2UL) );
7679 xmm4 += a1 * set( B(k,j+3UL) );
7680 }
7681
7682 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7683 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
7684 C.store( i, j+2UL, C.load(i,j+2UL) + xmm3 * factor );
7685 C.store( i, j+3UL, C.load(i,j+3UL) + xmm4 * factor );
7686 }
7687 }
7688
7689 for( ; (j+3UL) <= jend; j+=3UL )
7690 {
7691 const size_t kbegin( ( IsLower_v<MT5> )
7692 ?( ( IsUpper_v<MT4> )
7693 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7694 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7695 :( IsUpper_v<MT4> ? i : 0UL ) );
7696 const size_t kend( ( IsUpper_v<MT5> )
7697 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
7698 :( K ) );
7699
7700 size_t k( kbegin );
7701
7702 if( k < kend )
7703 {
7704 SIMDType a1( A.load(i,k) );
7705 SIMDType xmm1( a1 * set( B(k,j ) ) );
7706 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
7707 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
7708
7709 for( ++k; k<kend; ++k ) {
7710 a1 = A.load(i,k);
7711 xmm1 += a1 * set( B(k,j ) );
7712 xmm2 += a1 * set( B(k,j+1UL) );
7713 xmm3 += a1 * set( B(k,j+2UL) );
7714 }
7715
7716 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7717 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
7718 C.store( i, j+2UL, C.load(i,j+2UL) + xmm3 * factor );
7719 }
7720 }
7721
7722 for( ; (j+2UL) <= jend; j+=2UL )
7723 {
7724 const size_t kbegin( ( IsLower_v<MT5> )
7725 ?( ( IsUpper_v<MT4> )
7726 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7727 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7728 :( IsUpper_v<MT4> ? i : 0UL ) );
7729 const size_t kend( ( IsUpper_v<MT5> )
7730 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7731 :( K ) );
7732
7733 size_t k( kbegin );
7734
7735 if( k < kend )
7736 {
7737 SIMDType a1( A.load(i,k) );
7738 SIMDType xmm1( a1 * set( B(k,j ) ) );
7739 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
7740
7741 for( ++k; k<kend; ++k ) {
7742 a1 = A.load(i,k);
7743 xmm1 += a1 * set( B(k,j ) );
7744 xmm2 += a1 * set( B(k,j+1UL) );
7745 }
7746
7747 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7748 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
7749 }
7750 }
7751
7752 if( j < jend )
7753 {
7754 const size_t kbegin( ( IsLower_v<MT5> )
7755 ?( ( IsUpper_v<MT4> )
7756 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7757 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7758 :( IsUpper_v<MT4> ? i : 0UL ) );
7759
7760 size_t k( kbegin );
7761
7762 if( k < K )
7763 {
7764 SIMDType xmm1( A.load(i,k) * set( B(k,j) ) );
7765
7766 for( ++k; k<K; ++k ) {
7767 xmm1 += A.load(i,k) * set( B(k,j) );
7768 }
7769
7770 C.store( i, j, C.load(i,j) + xmm1 * factor );
7771 }
7772 }
7773 }
7774
7775 for( ; remainder && i<M; ++i )
7776 {
7777 const size_t jend( LOW ? i+1UL : N );
7778 size_t j( UPP ? i : 0UL );
7779
7780 for( ; (j+2UL) <= jend; j+=2UL )
7781 {
7782 const size_t kbegin( ( IsLower_v<MT5> )
7783 ?( ( IsUpper_v<MT4> )
7784 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7785 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7786 :( IsUpper_v<MT4> ? i : 0UL ) );
7787 const size_t kend( ( IsUpper_v<MT5> )
7788 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7789 :( K ) );
7790
7791 size_t k( kbegin );
7792
7793 if( k < kend )
7794 {
7795 ElementType value1( A(i,k) * B(k,j ) );
7796 ElementType value2( A(i,k) * B(k,j+1UL) );
7797
7798 for( ++k; k<kend; ++k ) {
7799 value1 += A(i,k) * B(k,j );
7800 value2 += A(i,k) * B(k,j+1UL);
7801 }
7802
7803 C(i,j ) += value1 * scalar;
7804 C(i,j+1UL) += value2 * scalar;
7805 }
7806 }
7807
7808 if( j < jend )
7809 {
7810 const size_t kbegin( ( IsLower_v<MT5> )
7811 ?( ( IsUpper_v<MT4> )
7812 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7813 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7814 :( IsUpper_v<MT4> ? i : 0UL ) );
7815
7816 size_t k( kbegin );
7817
7818 if( k < K )
7819 {
7820 ElementType value( A(i,k) * B(k,j) );
7821
7822 for( ++k; k<K; ++k ) {
7823 value += A(i,k) * B(k,j);
7824 }
7825
7826 C(i,j) += value * scalar;
7827 }
7828 }
7829 }
7830 }
7831 //**********************************************************************************************
7832
7833 //**Default addition assignment to dense matrices (large matrices)******************************
7847 template< typename MT3 // Type of the left-hand side target matrix
7848 , typename MT4 // Type of the left-hand side matrix operand
7849 , typename MT5 // Type of the right-hand side matrix operand
7850 , typename ST2 > // Type of the scalar value
7851 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7852 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7853 {
7854 selectDefaultAddAssignKernel( C, A, B, scalar );
7855 }
7856 //**********************************************************************************************
7857
7858 //**Vectorized default addition assignment to dense matrices (large matrices)*******************
7873 template< typename MT3 // Type of the left-hand side target matrix
7874 , typename MT4 // Type of the left-hand side matrix operand
7875 , typename MT5 // Type of the right-hand side matrix operand
7876 , typename ST2 > // Type of the scalar value
7877 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7878 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7879 {
7880 if( LOW )
7881 lmmm( C, A, B, scalar, ST2(1) );
7882 else if( UPP )
7883 ummm( C, A, B, scalar, ST2(1) );
7884 else
7885 mmm( C, A, B, scalar, ST2(1) );
7886 }
7887 //**********************************************************************************************
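   //**Editorial note on the blocked kernels above (sketch, not part of the original header)*******
   // The lmmm/ummm/mmm helpers invoked above are assumed to perform the blocked update
   // C = alpha*A*B + beta*C, with lmmm/ummm restricted to the lower/upper part of C. With
   // alpha = scalar and beta = ST2(1) this realizes the addition assignment C += scalar*(A*B).
   // In plain scalar form (alpha/beta standing for the fourth and fifth arguments):
   //
   //    for( size_t i=0UL; i<C.rows(); ++i ) {
   //       for( size_t j=0UL; j<C.columns(); ++j ) {
   //          ElementType_t<MT3> sum{};
   //          for( size_t k=0UL; k<A.columns(); ++k )
   //             sum += A(i,k) * B(k,j);
   //          C(i,j) = alpha*sum + beta*C(i,j);
   //       }
   //    }
   //**********************************************************************************************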
7888
7889 //**BLAS-based addition assignment to dense matrices (default)**********************************
7904 template< typename MT3 // Type of the left-hand side target matrix
7905 , typename MT4 // Type of the left-hand side matrix operand
7906 , typename MT5 // Type of the right-hand side matrix operand
7907 , typename ST2 > // Type of the scalar value
7908 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7909 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7910 {
7911 selectLargeAddAssignKernel( C, A, B, scalar );
7912 }
7913 //**********************************************************************************************
7914
7915 //**BLAS-based addition assignment to dense matrices********************************************
7916#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7930 template< typename MT3 // Type of the left-hand side target matrix
7931 , typename MT4 // Type of the left-hand side matrix operand
7932 , typename MT5 // Type of the right-hand side matrix operand
7933 , typename ST2 > // Type of the scalar value
7934 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7935 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7936 {
7937 using ET = ElementType_t<MT3>;
7938
7939 if( IsTriangular_v<MT4> ) {
7940 ResultType_t<MT3> tmp( serial( B ) );
7941 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7942 addAssign( C, tmp );
7943 }
7944 else if( IsTriangular_v<MT5> ) {
7945 ResultType_t<MT3> tmp( serial( A ) );
7946 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7947 addAssign( C, tmp );
7948 }
7949 else {
7950 gemm( C, A, B, ET(scalar), ET(1) );
7951 }
7952 }
7953#endif
7954 //**********************************************************************************************
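   //**Editorial note on the BLAS path above (sketch, not part of the original header)*************
   // The gemm( C, A, B, ET(scalar), ET(1) ) call is expected to realize C += scalar*(A*B), while
   // the triangular branches appear to form tmp = scalar*(A*B) via trmm and then add tmp to C.
   // In Blaze expression form the kernel is therefore equivalent to
   //
   //    addAssign( C, ( A * B ) * scalar );
   //
   // which matches the behaviour of the non-BLAS kernels.
   //**********************************************************************************************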
7955
7956 //**Restructuring addition assignment to row-major matrices*************************************
7971 template< typename MT > // Type of the target matrix
7972 friend inline auto addAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7973 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7974 {
7976
7978
7979 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
7980 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
7981
7982 const ForwardFunctor fwd;
7983
7984 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
7985 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
7986
7987 addAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
7988 }
7989 //**********************************************************************************************
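   //**Editorial note on the restructuring above (sketch, not part of the original header)*********
   // For a symmetric operand S one has trans(S) == S, so the product can be recast with the
   // transposed view of S without changing the result; this is what the
   // transIf< IsSymmetric_v<...> > helpers select at compile time. A hypothetical usage that
   // triggers this path:
   //
   //    blaze::SymmetricMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > S( 100UL );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> B( 100UL, 100UL );
   //    blaze::DynamicMatrix<double,blaze::rowMajor>    C( 100UL, 100UL );
   //
   //    C += ( S * B ) * 2.0;  // expected to be evaluated as ( trans(S) * B ) * 2.0
   //**********************************************************************************************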
7990
7991 //**Addition assignment to sparse matrices******************************************************
7992 // No special implementation for the addition assignment to sparse matrices.
7993 //**********************************************************************************************
7994
7995 //**Subtraction assignment to dense matrices****************************************************
8007 template< typename MT // Type of the target dense matrix
8008 , bool SO > // Storage order of the target dense matrix
8009 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8010 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8011 {
8013
8014 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
8015 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
8016
8017 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8018 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8019
8020 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
8021 return;
8022 }
8023
8024 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
8025 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
8026
8027 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8028 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8029 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8030 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8031 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
8032 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
8033
8034 DMatScalarMultExpr::selectSubAssignKernel( *lhs, A, B, rhs.scalar_ );
8035 }
8036 //**********************************************************************************************
8037
8038 //**Subtraction assignment to dense matrices (kernel selection)*********************************
8049 template< typename MT3 // Type of the left-hand side target matrix
8050 , typename MT4 // Type of the left-hand side matrix operand
8051 , typename MT5 // Type of the right-hand side matrix operand
8052 , typename ST2 > // Type of the scalar value
8053 static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8054 {
8055 if( ( IsDiagonal_v<MT4> ) ||
8056 ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
8057 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
8058 selectSmallSubAssignKernel( C, A, B, scalar );
8059 else
8060 selectBlasSubAssignKernel( C, A, B, scalar );
8061 }
8062 //**********************************************************************************************
8063
8064 //**Default subtraction assignment to dense matrices (general/general)**************************
8078 template< typename MT3 // Type of the left-hand side target matrix
8079 , typename MT4 // Type of the left-hand side matrix operand
8080 , typename MT5 // Type of the right-hand side matrix operand
8081 , typename ST2 > // Type of the scalar value
8082 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8083 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8084 {
8085 const ResultType tmp( serial( A * B * scalar ) );
8086 subAssign( C, tmp );
8087 }
8088 //**********************************************************************************************
8089
8090 //**Default subtraction assignment to dense matrices (general/diagonal)*************************
8104 template< typename MT3 // Type of the left-hand side target matrix
8105 , typename MT4 // Type of the left-hand side matrix operand
8106 , typename MT5 // Type of the right-hand side matrix operand
8107 , typename ST2 > // Type of the scalar value
8108 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8109 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8110 {
8112
8113 const size_t M( A.rows() );
8114 const size_t N( B.columns() );
8115
8116 for( size_t j=0UL; j<N; ++j )
8117 {
8118 const size_t ibegin( ( IsLower_v<MT4> )
8119 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
8120 :( 0UL ) );
8121 const size_t iend( ( IsUpper_v<MT4> )
8122 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
8123 :( M ) );
8124 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
8125
8126 const size_t inum( iend - ibegin );
8127 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
8128 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
8129
8130 for( size_t i=ibegin; i<ipos; i+=2UL ) {
8131 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
8132 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
8133 }
8134 if( ipos < iend ) {
8135 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
8136 }
8137 }
8138 }
8139 //**********************************************************************************************
8140
8141 //**Default subtraction assignment to dense matrices (diagonal/general)*************************
8155 template< typename MT3 // Type of the left-hand side target matrix
8156 , typename MT4 // Type of the left-hand side matrix operand
8157 , typename MT5 // Type of the right-hand side matrix operand
8158 , typename ST2 > // Type of the scalar value
8159 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8160 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8161 {
8163
8164 const size_t M( A.rows() );
8165 const size_t N( B.columns() );
8166
8167 for( size_t j=0UL; j<N; ++j )
8168 {
8169 const size_t ibegin( ( IsLower_v<MT5> )
8170 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
8171 :( 0UL ) );
8172 const size_t iend( ( IsUpper_v<MT5> )
8173 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
8174 :( M ) );
8175 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
8176
8177 const size_t inum( iend - ibegin );
8178 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
8179 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
8180
8181 for( size_t i=ibegin; i<ipos; i+=2UL ) {
8182 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
8183 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
8184 }
8185 if( ipos < iend ) {
8186 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
8187 }
8188 }
8189 }
8190 //**********************************************************************************************
8191
8192 //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
8206 template< typename MT3 // Type of the left-hand side target matrix
8207 , typename MT4 // Type of the left-hand side matrix operand
8208 , typename MT5 // Type of the right-hand side matrix operand
8209 , typename ST2 > // Type of the scalar value
8210 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8211 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8212 {
8214
8215 for( size_t i=0UL; i<A.rows(); ++i ) {
8216 C(i,i) -= A(i,i) * B(i,i) * scalar;
8217 }
8218 }
8219 //**********************************************************************************************
8220
8221 //**Default subtraction assignment to dense matrices (small matrices)***************************
8235 template< typename MT3 // Type of the left-hand side target matrix
8236 , typename MT4 // Type of the left-hand side matrix operand
8237 , typename MT5 // Type of the right-hand side matrix operand
8238 , typename ST2 > // Type of the scalar value
8239 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8240 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8241 {
8242 selectDefaultSubAssignKernel( C, A, B, scalar );
8243 }
8244 //**********************************************************************************************
8245
8246 //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
8261 template< typename MT3 // Type of the left-hand side target matrix
8262 , typename MT4 // Type of the left-hand side matrix operand
8263 , typename MT5 // Type of the right-hand side matrix operand
8264 , typename ST2 > // Type of the scalar value
8265 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8266 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8267 {
8272
8273 const ForwardFunctor fwd;
8274
8275 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
8276 const OppositeType_t<MT5> tmp( serial( B ) );
8277 subAssign( C, fwd( A * tmp ) * scalar );
8278 }
8279 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
8280 const OppositeType_t<MT4> tmp( serial( A ) );
8281 subAssign( C, fwd( tmp * B ) * scalar );
8282 }
8283 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
8284 const OppositeType_t<MT5> tmp( serial( B ) );
8285 subAssign( C, fwd( A * tmp ) * scalar );
8286 }
8287 else {
8288 const OppositeType_t<MT4> tmp( serial( A ) );
8289 subAssign( C, fwd( tmp * B ) * scalar );
8290 }
8291 }
8292 //**********************************************************************************************
8293
8294 //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
8309 template< typename MT3 // Type of the left-hand side target matrix
8310 , typename MT4 // Type of the left-hand side matrix operand
8311 , typename MT5 // Type of the right-hand side matrix operand
8312 , typename ST2 > // Type of the scalar value
8313 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8314 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8315 {
8316 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
8317
8318 const size_t M( A.rows() );
8319 const size_t N( B.columns() );
8320 const size_t K( A.columns() );
8321
8322 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8323
8324 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
8325 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
8326
8327 const SIMDType factor( set( scalar ) );
8328
8329 size_t i( 0UL );
8330
8331 if( IsIntegral_v<ElementType> )
8332 {
8333 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8334 for( size_t j=0UL; j<N; ++j )
8335 {
8336 const size_t kbegin( ( IsLower_v<MT5> )
8337 ?( ( IsUpper_v<MT4> )
8338 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8339 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8340 :( IsUpper_v<MT4> ? i : 0UL ) );
8341 const size_t kend( ( IsUpper_v<MT5> )
8342 ?( ( IsLower_v<MT4> )
8343 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
8344 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
8345 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
8346
8347 size_t k( kbegin );
8348
8349 if( k < kend )
8350 {
8351 SIMDType b1( set( B(k,j) ) );
8352 SIMDType xmm1( A.load(i ,k) * b1 );
8353 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
8354 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
8355 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
8356 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
8357 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,k) * b1 );
8358 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,k) * b1 );
8359 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,k) * b1 );
8360
8361 for( ++k; k<kend; ++k ) {
8362 b1 = set( B(k,j) );
8363 xmm1 += A.load(i ,k) * b1;
8364 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8365 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8366 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8367 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8368 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8369 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8370 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
8371 }
8372
8373 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8374 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
8375 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
8376 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
8377 C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
8378 C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
8379 C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
8380 C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
8381 }
8382 }
8383 }
8384 }
8385
8386 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
8387 {
8388 size_t j( 0UL );
8389
8390 for( ; (j+2UL) <= N; j+=2UL )
8391 {
8392 const size_t kbegin( ( IsLower_v<MT5> )
8393 ?( ( IsUpper_v<MT4> )
8394 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8395 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8396 :( IsUpper_v<MT4> ? i : 0UL ) );
8397 const size_t kend( ( IsUpper_v<MT5> )
8398 ?( ( IsLower_v<MT4> )
8399 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8400 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8401 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
8402
8403 size_t k( kbegin );
8404
8405 if( k < kend )
8406 {
8407 SIMDType a1( A.load(i ,k) );
8408 SIMDType a2( A.load(i+SIMDSIZE ,k) );
8409 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8410 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8411 SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8412 SIMDType b1( set( B(k,j ) ) );
8413 SIMDType b2( set( B(k,j+1UL) ) );
8414 SIMDType xmm1 ( a1 * b1 );
8415 SIMDType xmm2 ( a2 * b1 );
8416 SIMDType xmm3 ( a3 * b1 );
8417 SIMDType xmm4 ( a4 * b1 );
8418 SIMDType xmm5 ( a5 * b1 );
8419 SIMDType xmm6 ( a1 * b2 );
8420 SIMDType xmm7 ( a2 * b2 );
8421 SIMDType xmm8 ( a3 * b2 );
8422 SIMDType xmm9 ( a4 * b2 );
8423 SIMDType xmm10( a5 * b2 );
8424
8425 for( ++k; k<kend; ++k ) {
8426 a1 = A.load(i ,k);
8427 a2 = A.load(i+SIMDSIZE ,k);
8428 a3 = A.load(i+SIMDSIZE*2UL,k);
8429 a4 = A.load(i+SIMDSIZE*3UL,k);
8430 a5 = A.load(i+SIMDSIZE*4UL,k);
8431 b1 = set( B(k,j ) );
8432 b2 = set( B(k,j+1UL) );
8433 xmm1 += a1 * b1;
8434 xmm2 += a2 * b1;
8435 xmm3 += a3 * b1;
8436 xmm4 += a4 * b1;
8437 xmm5 += a5 * b1;
8438 xmm6 += a1 * b2;
8439 xmm7 += a2 * b2;
8440 xmm8 += a3 * b2;
8441 xmm9 += a4 * b2;
8442 xmm10 += a5 * b2;
8443 }
8444
8445 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8446 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
8447 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
8448 C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
8449 C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
8450 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
8451 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
8452 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
8453 C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
8454 C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
8455 }
8456 }
8457
8458 if( j < N )
8459 {
8460 const size_t kbegin( ( IsLower_v<MT5> )
8461 ?( ( IsUpper_v<MT4> )
8462 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8463 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8464 :( IsUpper_v<MT4> ? i : 0UL ) );
8465 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
8466
8467 size_t k( kbegin );
8468
8469 if( k < kend )
8470 {
8471 SIMDType b1( set( B(k,j) ) );
8472 SIMDType xmm1( A.load(i ,k) * b1 );
8473 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
8474 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
8475 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
8476 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
8477
8478 for( ++k; k<kend; ++k ) {
8479 b1 = set( B(k,j) );
8480 xmm1 += A.load(i ,k) * b1;
8481 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8482 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8483 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8484 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8485 }
8486
8487 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8488 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
8489 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
8490 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
8491 C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
8492 }
8493 }
8494 }
8495
8496 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
8497 {
8498 size_t j( 0UL );
8499
8500 for( ; (j+2UL) <= N; j+=2UL )
8501 {
8502 const size_t kbegin( ( IsLower_v<MT5> )
8503 ?( ( IsUpper_v<MT4> )
8504 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8505 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8506 :( IsUpper_v<MT4> ? i : 0UL ) );
8507 const size_t kend( ( IsUpper_v<MT5> )
8508 ?( ( IsLower_v<MT4> )
8509 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8510 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8511 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
8512
8513 size_t k( kbegin );
8514
8515 if( k < kend )
8516 {
8517 SIMDType a1( A.load(i ,k) );
8518 SIMDType a2( A.load(i+SIMDSIZE ,k) );
8519 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8520 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8521 SIMDType b1( set( B(k,j ) ) );
8522 SIMDType b2( set( B(k,j+1UL) ) );
8523 SIMDType xmm1( a1 * b1 );
8524 SIMDType xmm2( a2 * b1 );
8525 SIMDType xmm3( a3 * b1 );
8526 SIMDType xmm4( a4 * b1 );
8527 SIMDType xmm5( a1 * b2 );
8528 SIMDType xmm6( a2 * b2 );
8529 SIMDType xmm7( a3 * b2 );
8530 SIMDType xmm8( a4 * b2 );
8531
8532 for( ++k; k<kend; ++k ) {
8533 a1 = A.load(i ,k);
8534 a2 = A.load(i+SIMDSIZE ,k);
8535 a3 = A.load(i+SIMDSIZE*2UL,k);
8536 a4 = A.load(i+SIMDSIZE*3UL,k);
8537 b1 = set( B(k,j ) );
8538 b2 = set( B(k,j+1UL) );
8539 xmm1 += a1 * b1;
8540 xmm2 += a2 * b1;
8541 xmm3 += a3 * b1;
8542 xmm4 += a4 * b1;
8543 xmm5 += a1 * b2;
8544 xmm6 += a2 * b2;
8545 xmm7 += a3 * b2;
8546 xmm8 += a4 * b2;
8547 }
8548
8549 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8550 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
8551 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
8552 C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
8553 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
8554 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
8555 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
8556 C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
8557 }
8558 }
8559
8560 if( j < N )
8561 {
8562 const size_t kbegin( ( IsLower_v<MT5> )
8563 ?( ( IsUpper_v<MT4> )
8564 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8565 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8566 :( IsUpper_v<MT4> ? i : 0UL ) );
8567 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
8568
8569 size_t k( kbegin );
8570
8571 if( k < kend )
8572 {
8573 SIMDType b1( set( B(k,j) ) );
8574 SIMDType xmm1( A.load(i ,k) * b1 );
8575 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
8576 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
8577 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
8578
8579 for( ++k; k<kend; ++k ) {
8580 b1 = set( B(k,j) );
8581 xmm1 += A.load(i ,k) * b1;
8582 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8583 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8584 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8585 }
8586
8587 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8588 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
8589 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
8590 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
8591 }
8592 }
8593 }
8594
8595 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8596 {
8597 size_t j( 0UL );
8598
8599 for( ; (j+2UL) <= N; j+=2UL )
8600 {
8601 const size_t kbegin( ( IsLower_v<MT5> )
8602 ?( ( IsUpper_v<MT4> )
8603 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8604 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8605 :( IsUpper_v<MT4> ? i : 0UL ) );
8606 const size_t kend( ( IsUpper_v<MT5> )
8607 ?( ( IsLower_v<MT4> )
8608 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8609 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8610 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
8611
8612 size_t k( kbegin );
8613
8614 if( k < kend )
8615 {
8616 SIMDType a1( A.load(i ,k) );
8617 SIMDType a2( A.load(i+SIMDSIZE ,k) );
8618 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8619 SIMDType b1( set( B(k,j ) ) );
8620 SIMDType b2( set( B(k,j+1UL) ) );
8621 SIMDType xmm1( a1 * b1 );
8622 SIMDType xmm2( a2 * b1 );
8623 SIMDType xmm3( a3 * b1 );
8624 SIMDType xmm4( a1 * b2 );
8625 SIMDType xmm5( a2 * b2 );
8626 SIMDType xmm6( a3 * b2 );
8627
8628 for( ++k; k<kend; ++k ) {
8629 a1 = A.load(i ,k);
8630 a2 = A.load(i+SIMDSIZE ,k);
8631 a3 = A.load(i+SIMDSIZE*2UL,k);
8632 b1 = set( B(k,j ) );
8633 b2 = set( B(k,j+1UL) );
8634 xmm1 += a1 * b1;
8635 xmm2 += a2 * b1;
8636 xmm3 += a3 * b1;
8637 xmm4 += a1 * b2;
8638 xmm5 += a2 * b2;
8639 xmm6 += a3 * b2;
8640 }
8641
8642 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8643 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
8644 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
8645 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
8646 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
8647 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
8648 }
8649 }
8650
8651 if( j < N )
8652 {
8653 const size_t kbegin( ( IsLower_v<MT5> )
8654 ?( ( IsUpper_v<MT4> )
8655 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8656 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8657 :( IsUpper_v<MT4> ? i : 0UL ) );
8658 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
8659
8660 size_t k( kbegin );
8661
8662 if( k < kend )
8663 {
8664 SIMDType b1( set( B(k,j) ) );
8665 SIMDType xmm1( A.load(i ,k) * b1 );
8666 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
8667 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
8668
8669 for( ++k; k<kend; ++k ) {
8670 b1 = set( B(k,j) );
8671 xmm1 += A.load(i ,k) * b1;
8672 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8673 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8674 }
8675
8676 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8677 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
8678 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
8679 }
8680 }
8681 }
8682
8683 for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8684 {
8685 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
8686 size_t j( UPP ? i : 0UL );
8687
8688 for( ; (j+4UL) <= jend; j+=4UL )
8689 {
8690 const size_t kbegin( ( IsLower_v<MT5> )
8691 ?( ( IsUpper_v<MT4> )
8692 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8693 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8694 :( IsUpper_v<MT4> ? i : 0UL ) );
8695 const size_t kend( ( IsUpper_v<MT5> )
8696 ?( ( IsLower_v<MT4> )
8697 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
8698 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
8699 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8700
8701 size_t k( kbegin );
8702
8703 if( k < kend )
8704 {
8705 SIMDType a1( A.load(i ,k) );
8706 SIMDType a2( A.load(i+SIMDSIZE,k) );
8707 SIMDType b1( set( B(k,j ) ) );
8708 SIMDType b2( set( B(k,j+1UL) ) );
8709 SIMDType b3( set( B(k,j+2UL) ) );
8710 SIMDType b4( set( B(k,j+3UL) ) );
8711 SIMDType xmm1( a1 * b1 );
8712 SIMDType xmm2( a2 * b1 );
8713 SIMDType xmm3( a1 * b2 );
8714 SIMDType xmm4( a2 * b2 );
8715 SIMDType xmm5( a1 * b3 );
8716 SIMDType xmm6( a2 * b3 );
8717 SIMDType xmm7( a1 * b4 );
8718 SIMDType xmm8( a2 * b4 );
8719
8720 for( ++k; k<kend; ++k ) {
8721 a1 = A.load(i ,k);
8722 a2 = A.load(i+SIMDSIZE,k);
8723 b1 = set( B(k,j ) );
8724 b2 = set( B(k,j+1UL) );
8725 b3 = set( B(k,j+2UL) );
8726 b4 = set( B(k,j+3UL) );
8727 xmm1 += a1 * b1;
8728 xmm2 += a2 * b1;
8729 xmm3 += a1 * b2;
8730 xmm4 += a2 * b2;
8731 xmm5 += a1 * b3;
8732 xmm6 += a2 * b3;
8733 xmm7 += a1 * b4;
8734 xmm8 += a2 * b4;
8735 }
8736
8737 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8738 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
8739 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8740 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8741 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8742 C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
8743 C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
8744 C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
8745 }
8746 }
8747
8748 for( ; (j+3UL) <= jend; j+=3UL )
8749 {
8750 const size_t kbegin( ( IsLower_v<MT5> )
8751 ?( ( IsUpper_v<MT4> )
8752 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8753 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8754 :( IsUpper_v<MT4> ? i : 0UL ) );
8755 const size_t kend( ( IsUpper_v<MT5> )
8756 ?( ( IsLower_v<MT4> )
8757 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
8758 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
8759 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8760
8761 size_t k( kbegin );
8762
8763 if( k < kend )
8764 {
8765 SIMDType a1( A.load(i ,k) );
8766 SIMDType a2( A.load(i+SIMDSIZE,k) );
8767 SIMDType b1( set( B(k,j ) ) );
8768 SIMDType b2( set( B(k,j+1UL) ) );
8769 SIMDType b3( set( B(k,j+2UL) ) );
8770 SIMDType xmm1( a1 * b1 );
8771 SIMDType xmm2( a2 * b1 );
8772 SIMDType xmm3( a1 * b2 );
8773 SIMDType xmm4( a2 * b2 );
8774 SIMDType xmm5( a1 * b3 );
8775 SIMDType xmm6( a2 * b3 );
8776
8777 for( ++k; k<kend; ++k ) {
8778 a1 = A.load(i ,k);
8779 a2 = A.load(i+SIMDSIZE,k);
8780 b1 = set( B(k,j ) );
8781 b2 = set( B(k,j+1UL) );
8782 b3 = set( B(k,j+2UL) );
8783 xmm1 += a1 * b1;
8784 xmm2 += a2 * b1;
8785 xmm3 += a1 * b2;
8786 xmm4 += a2 * b2;
8787 xmm5 += a1 * b3;
8788 xmm6 += a2 * b3;
8789 }
8790
8791 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8792 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
8793 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8794 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8795 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8796 C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
8797 }
8798 }
8799
8800 for( ; (j+2UL) <= jend; j+=2UL )
8801 {
8802 const size_t kbegin( ( IsLower_v<MT5> )
8803 ?( ( IsUpper_v<MT4> )
8804 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8805 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8806 :( IsUpper_v<MT4> ? i : 0UL ) );
8807 const size_t kend( ( IsUpper_v<MT5> )
8808 ?( ( IsLower_v<MT4> )
8809 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8810 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8811 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8812
8813 size_t k( kbegin );
8814
8815 if( k < kend )
8816 {
8817 SIMDType a1( A.load(i ,k) );
8818 SIMDType a2( A.load(i+SIMDSIZE,k) );
8819 SIMDType b1( set( B(k,j ) ) );
8820 SIMDType b2( set( B(k,j+1UL) ) );
8821 SIMDType xmm1( a1 * b1 );
8822 SIMDType xmm2( a2 * b1 );
8823 SIMDType xmm3( a1 * b2 );
8824 SIMDType xmm4( a2 * b2 );
8825
8826 for( ++k; k<kend; ++k ) {
8827 a1 = A.load(i ,k);
8828 a2 = A.load(i+SIMDSIZE,k);
8829 b1 = set( B(k,j ) );
8830 b2 = set( B(k,j+1UL) );
8831 xmm1 += a1 * b1;
8832 xmm2 += a2 * b1;
8833 xmm3 += a1 * b2;
8834 xmm4 += a2 * b2;
8835 }
8836
8837 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8838 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
8839 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8840 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8841 }
8842 }
8843
8844 if( j < jend )
8845 {
8846 const size_t kbegin( ( IsLower_v<MT5> )
8847 ?( ( IsUpper_v<MT4> )
8848 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8849 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8850 :( IsUpper_v<MT4> ? i : 0UL ) );
8851 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8852
8853 size_t k( kbegin );
8854
8855 if( k < kend )
8856 {
8857 SIMDType b1( set( B(k,j) ) );
8858 SIMDType xmm1( A.load(i ,k) * b1 );
8859 SIMDType xmm2( A.load(i+SIMDSIZE,k) * b1 );
8860
8861 for( ++k; k<kend; ++k ) {
8862 b1 = set( B(k,j) );
8863 xmm1 += A.load(i ,k) * b1;
8864 xmm2 += A.load(i+SIMDSIZE,k) * b1;
8865 }
8866
8867 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8868 C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) - xmm2 * factor );
8869 }
8870 }
8871 }
8872
8873 for( ; i<ipos; i+=SIMDSIZE )
8874 {
8875 const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
8876 size_t j( UPP ? i : 0UL );
8877
8878 for( ; (j+4UL) <= jend; j+=4UL )
8879 {
8880 const size_t kbegin( ( IsLower_v<MT5> )
8881 ?( ( IsUpper_v<MT4> )
8882 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8883 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8884 :( IsUpper_v<MT4> ? i : 0UL ) );
8885 const size_t kend( ( IsUpper_v<MT5> )
8886 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
8887 :( K ) );
8888
8889 size_t k( kbegin );
8890
8891 if( k < kend )
8892 {
8893 SIMDType a1( A.load(i,k) );
8894 SIMDType xmm1( a1 * set( B(k,j ) ) );
8895 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
8896 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
8897 SIMDType xmm4( a1 * set( B(k,j+3UL) ) );
8898
8899 for( ++k; k<kend; ++k ) {
8900 a1 = A.load(i,k);
8901 xmm1 += a1 * set( B(k,j ) );
8902 xmm2 += a1 * set( B(k,j+1UL) );
8903 xmm3 += a1 * set( B(k,j+2UL) );
8904 xmm4 += a1 * set( B(k,j+3UL) );
8905 }
8906
8907 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8908 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
8909 C.store( i, j+2UL, C.load(i,j+2UL) - xmm3 * factor );
8910 C.store( i, j+3UL, C.load(i,j+3UL) - xmm4 * factor );
8911 }
8912 }
8913
8914 for( ; (j+3UL) <= jend; j+=3UL )
8915 {
8916 const size_t kbegin( ( IsLower_v<MT5> )
8917 ?( ( IsUpper_v<MT4> )
8918 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8919 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8920 :( IsUpper_v<MT4> ? i : 0UL ) );
8921 const size_t kend( ( IsUpper_v<MT5> )
8922 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
8923 :( K ) );
8924
8925 size_t k( kbegin );
8926
8927 if( k < kend )
8928 {
8929 SIMDType a1( A.load(i,k) );
8930 SIMDType xmm1( a1 * set( B(k,j ) ) );
8931 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
8932 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
8933
8934 for( ++k; k<kend; ++k ) {
8935 a1 = A.load(i,k);
8936 xmm1 += a1 * set( B(k,j ) );
8937 xmm2 += a1 * set( B(k,j+1UL) );
8938 xmm3 += a1 * set( B(k,j+2UL) );
8939 }
8940
8941 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8942 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
8943 C.store( i, j+2UL, C.load(i,j+2UL) - xmm3 * factor );
8944 }
8945 }
8946
8947 for( ; (j+2UL) <= jend; j+=2UL )
8948 {
8949 const size_t kbegin( ( IsLower_v<MT5> )
8950 ?( ( IsUpper_v<MT4> )
8951 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8952 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8953 :( IsUpper_v<MT4> ? i : 0UL ) );
8954 const size_t kend( ( IsUpper_v<MT5> )
8955 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8956 :( K ) );
8957
8958 size_t k( kbegin );
8959
8960 if( k < kend )
8961 {
8962 SIMDType a1( A.load(i,k) );
8963 SIMDType xmm1( a1 * set( B(k,j ) ) );
8964 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
8965
8966 for( ++k; k<kend; ++k ) {
8967 a1 = A.load(i,k);
8968 xmm1 += a1 * set( B(k,j ) );
8969 xmm2 += a1 * set( B(k,j+1UL) );
8970 }
8971
8972 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8973 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
8974 }
8975 }
8976
8977 if( j < jend )
8978 {
8979 const size_t kbegin( ( IsLower_v<MT5> )
8980 ?( ( IsUpper_v<MT4> )
8981 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8982 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8983 :( IsUpper_v<MT4> ? i : 0UL ) );
8984
8985 size_t k( kbegin );
8986
8987 if( k < K )
8988 {
8989 SIMDType xmm1( A.load(i,k) * set( B(k,j) ) );
8990
8991 for( ++k; k<K; ++k ) {
8992 xmm1 += A.load(i,k) * set( B(k,j) );
8993 }
8994
8995 C.store( i, j, C.load(i,j) - xmm1 * factor );
8996 }
8997 }
8998 }
8999
9000 for( ; remainder && i<M; ++i )
9001 {
9002 const size_t jend( LOW ? i+1UL : N );
9003 size_t j( UPP ? i : 0UL );
9004
9005 for( ; (j+2UL) <= jend; j+=2UL )
9006 {
9007 const size_t kbegin( ( IsLower_v<MT5> )
9008 ?( ( IsUpper_v<MT4> )
9009 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9010 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9011 :( IsUpper_v<MT4> ? i : 0UL ) );
9012 const size_t kend( ( IsUpper_v<MT5> )
9013 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9014 :( K ) );
9015
9016 size_t k( kbegin );
9017
9018 if( k < kend )
9019 {
9020 ElementType value1( A(i,k) * B(k,j ) );
9021 ElementType value2( A(i,k) * B(k,j+1UL) );
9022
9023 for( ++k; k<kend; ++k ) {
9024 value1 += A(i,k) * B(k,j );
9025 value2 += A(i,k) * B(k,j+1UL);
9026 }
9027
9028 C(i,j ) -= value1 * scalar;
9029 C(i,j+1UL) -= value2 * scalar;
9030 }
9031 }
9032
9033 if( j < jend )
9034 {
9035 const size_t kbegin( ( IsLower_v<MT5> )
9036 ?( ( IsUpper_v<MT4> )
9037 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9038 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9039 :( IsUpper_v<MT4> ? i : 0UL ) );
9040
9041 size_t k( kbegin );
9042
9043 if( k < K )
9044 {
9045 ElementType value( A(i,k) * B(k,j) );
9046
9047 for( ++k; k<K; ++k ) {
9048 value += A(i,k) * B(k,j);
9049 }
9050
9051 C(i,j) -= value * scalar;
9052 }
9053 }
9054 }
9055 }
9056 //**********************************************************************************************
9057
9058 //**Default subtraction assignment to dense matrices (large matrices)***************************
9072 template< typename MT3 // Type of the left-hand side target matrix
9073 , typename MT4 // Type of the left-hand side matrix operand
9074 , typename MT5 // Type of the right-hand side matrix operand
9075 , typename ST2 > // Type of the scalar value
9076 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9077 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9078 {
9079 selectDefaultSubAssignKernel( C, A, B, scalar );
9080 }
9081 //**********************************************************************************************
9082
9083 //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
9098 template< typename MT3 // Type of the left-hand side target matrix
9099 , typename MT4 // Type of the left-hand side matrix operand
9100 , typename MT5 // Type of the right-hand side matrix operand
9101 , typename ST2 > // Type of the scalar value
9102 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9103 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9104 {
9105 if( LOW )
9106 lmmm( C, A, B, -scalar, ST2(1) );
9107 else if( UPP )
9108 ummm( C, A, B, -scalar, ST2(1) );
9109 else
9110 mmm( C, A, B, -scalar, ST2(1) );
9111 }
9112 //**********************************************************************************************
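   //**Editorial note (sketch, not part of the original header)************************************
   // As in the addition variant above, the blocked kernels are assumed to compute
   // C = alpha*A*B + beta*C; passing alpha = -scalar and beta = ST2(1) therefore realizes the
   // subtraction assignment C -= scalar*(A*B), e.g.
   //
   //    mmm( C, A, B, -scalar, ST2(1) );  // C = (-scalar)*(A*B) + C
   //**********************************************************************************************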
9113
9114 //**BLAS-based subtraction assignment to dense matrices (default)*******************************
9129 template< typename MT3 // Type of the left-hand side target matrix
9130 , typename MT4 // Type of the left-hand side matrix operand
9131 , typename MT5 // Type of the right-hand side matrix operand
9132 , typename ST2 > // Type of the scalar value
9133 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9134 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9135 {
9136 selectLargeSubAssignKernel( C, A, B, scalar );
9137 }
9138 //**********************************************************************************************
9139
 9140 //**BLAS-based subtraction assignment to dense matrices*****************************************
9141#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9155 template< typename MT3 // Type of the left-hand side target matrix
9156 , typename MT4 // Type of the left-hand side matrix operand
9157 , typename MT5 // Type of the right-hand side matrix operand
9158 , typename ST2 > // Type of the scalar value
9159 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9160 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9161 {
9162 using ET = ElementType_t<MT3>;
9163
9164 if( IsTriangular_v<MT4> ) {
9165 ResultType_t<MT3> tmp( serial( B ) );
9166 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9167 subAssign( C, tmp );
9168 }
9169 else if( IsTriangular_v<MT5> ) {
9170 ResultType_t<MT3> tmp( serial( A ) );
9171 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9172 subAssign( C, tmp );
9173 }
9174 else {
9175 gemm( C, A, B, ET(-scalar), ET(1) );
9176 }
9177 }
9178#endif
9179 //**********************************************************************************************
9180
9181 //**Restructuring subtraction assignment to row-major matrices**********************************
9195 template< typename MT > // Type of the target matrix
9196 friend inline auto subAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
9197 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9198 {
9200
9202
9203 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9204 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9205
9206 const ForwardFunctor fwd;
9207
9208 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9209 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9210
9211 subAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9212 }
9213 //**********************************************************************************************
9214
9215 //**Subtraction assignment to sparse matrices***************************************************
9216 // No special implementation for the subtraction assignment to sparse matrices.
9217 //**********************************************************************************************
9218
9219 //**Schur product assignment to dense matrices**************************************************
9231 template< typename MT // Type of the target dense matrix
9232 , bool SO > // Storage order of the target dense matrix
9233 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9234 {
9236
9240
9241 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9242 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9243
9244 const ResultType tmp( serial( rhs ) );
9245 schurAssign( *lhs, tmp );
9246 }
9247 //**********************************************************************************************
9248
9249 //**Schur product assignment to sparse matrices*************************************************
9250 // No special implementation for the Schur product assignment to sparse matrices.
9251 //**********************************************************************************************
9252
9253 //**Multiplication assignment to dense matrices*************************************************
9254 // No special implementation for the multiplication assignment to dense matrices.
9255 //**********************************************************************************************
9256
9257 //**Multiplication assignment to sparse matrices************************************************
9258 // No special implementation for the multiplication assignment to sparse matrices.
9259 //**********************************************************************************************
9260
9261 //**SMP assignment to dense matrices************************************************************
9276 template< typename MT // Type of the target dense matrix
9277 , bool SO > // Storage order of the target dense matrix
9278 friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9279 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9280 {
9282
9283 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9284 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9285
9286 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9287 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9288
9289 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
9290 return;
9291 }
9292 else if( left.columns() == 0UL ) {
9293 reset( *lhs );
9294 return;
9295 }
9296
9297 LT A( left ); // Evaluation of the left-hand side dense matrix operand
9298 RT B( right ); // Evaluation of the right-hand side dense matrix operand
9299
9300 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9301 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9302 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9303 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9304 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
9305 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
9306
9307 smpAssign( *lhs, A * B * rhs.scalar_ );
9308 }
9309 //**********************************************************************************************
9310
9311 //**SMP assignment to sparse matrices***********************************************************
9326 template< typename MT // Type of the target sparse matrix
9327 , bool SO > // Storage order of the target sparse matrix
9328 friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9329 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9330 {
9332
9333 using TmpType = If_t< SO, ResultType, OppositeType >;
9334
9341
9342 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9343 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9344
9345 const ForwardFunctor fwd;
9346
9347 const TmpType tmp( rhs );
9348 smpAssign( *lhs, fwd( tmp ) );
9349 }
9350 //**********************************************************************************************
9351
9352 //**Restructuring SMP assignment to row-major matrices******************************************
9366 template< typename MT > // Type of the target matrix
9367 friend inline auto smpAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
9368 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9369 {
9371
9373
9374 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9375 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9376
9377 const ForwardFunctor fwd;
9378
9379 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9380 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9381
9382 smpAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9383 }
9384 //**********************************************************************************************
9385
9386 //**SMP addition assignment to dense matrices***************************************************
9401 template< typename MT // Type of the target dense matrix
9402 , bool SO > // Storage order of the target dense matrix
9403 friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9404 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9405 {
9407
9408 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9409 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9410
9411 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9412 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9413
9414 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9415 return;
9416 }
9417
9418 LT A( left ); // Evaluation of the left-hand side dense matrix operand
9419 RT B( right ); // Evaluation of the right-hand side dense matrix operand
9420
9421 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9422 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9423 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9424 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9425 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
9426 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
9427
9428 smpAddAssign( *lhs, A * B * rhs.scalar_ );
9429 }
9430 //**********************************************************************************************
9431
9432 //**Restructuring SMP addition assignment to row-major matrices*********************************
9447 template< typename MT > // Type of the target matrix
9448 friend inline auto smpAddAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
9449 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9450 {
9452
9454
9455 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9456 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9457
9458 const ForwardFunctor fwd;
9459
9460 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9461 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9462
9463 smpAddAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9464 }
9465 //**********************************************************************************************
9466
9467 //**SMP addition assignment to sparse matrices**************************************************
9468 // No special implementation for the SMP addition assignment to sparse matrices.
9469 //**********************************************************************************************
9470
9471 //**SMP subtraction assignment to dense matrices************************************************
9486 template< typename MT // Type of the target dense matrix
9487 , bool SO > // Storage order of the target dense matrix
9488 friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9489 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9490 {
9492
9493 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9494 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9495
9496 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9497 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9498
9499 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9500 return;
9501 }
9502
9503 LT A( left ); // Evaluation of the left-hand side dense matrix operand
9504 RT B( right ); // Evaluation of the right-hand side dense matrix operand
9505
9506 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9507 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9508 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9509 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9510 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
9511 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
9512
9513 smpSubAssign( *lhs, A * B * rhs.scalar_ );
9514 }
9515 //**********************************************************************************************
9516
9517 //**Restructuring SMP subtraction assignment to row-major matrices******************************
9532 template< typename MT > // Type of the target matrix
9533 friend inline auto smpSubAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
9534 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9535 {
9537
9539
9540 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9541 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9542
9543 const ForwardFunctor fwd;
9544
9545 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9546 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9547
9548 smpSubAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9549 }
9550 //**********************************************************************************************
9551
9552 //**SMP subtraction assignment to sparse matrices***********************************************
9553 // No special implementation for the SMP subtraction assignment to sparse matrices.
9554 //**********************************************************************************************
9555
9556 //**SMP Schur product assignment to dense matrices**********************************************
9568 template< typename MT // Type of the target dense matrix
9569 , bool SO > // Storage order of the target dense matrix
9570 friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9571 {
9573
9577
9578 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9579 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9580
9581 const ResultType tmp( rhs );
9582 smpSchurAssign( *lhs, tmp );
9583 }
9584 //**********************************************************************************************
9585
9586 //**SMP Schur product assignment to sparse matrices*********************************************
9587 // No special implementation for the SMP Schur product assignment to sparse matrices.
9588 //**********************************************************************************************
9589
9590 //**SMP multiplication assignment to dense matrices*********************************************
9591 // No special implementation for the SMP multiplication assignment to dense matrices.
9592 //**********************************************************************************************
9593
9594 //**SMP multiplication assignment to sparse matrices********************************************
9595 // No special implementation for the SMP multiplication assignment to sparse matrices.
9596 //**********************************************************************************************
9597
9598 //**Compile time checks*************************************************************************
9607 //**********************************************************************************************
9608};
9610//*************************************************************************************************
9611
9612
9613
9614
9615//=================================================================================================
9616//
9617// GLOBAL BINARY ARITHMETIC OPERATORS
9618//
9619//=================================================================================================
9620
9621//*************************************************************************************************
9648template< typename MT1 // Type of the left-hand side dense matrix
9649 , typename MT2 > // Type of the right-hand side dense matrix
9650inline decltype(auto)
9651 operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,true>& rhs )
9652{
9654
9655 if( (*lhs).columns() != (*rhs).rows() ) {
9656 BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
9657 }
 9658
 9659 using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,false,false,false,false>;
 9660 return ReturnType( *lhs, *rhs );
9661}
9662//*************************************************************************************************
9663
9664
9665
9666
9667//=================================================================================================
9668//
9669// GLOBAL FUNCTIONS
9670//
9671//=================================================================================================
9672
9673//*************************************************************************************************
9696template< typename MT1 // Type of the left-hand side dense matrix
9697 , typename MT2 // Type of the right-hand side dense matrix
9698 , bool SF // Symmetry flag
9699 , bool HF // Hermitian flag
9700 , bool LF // Lower flag
9701 , bool UF > // Upper flag
9702inline decltype(auto) declsym( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9703{
9705
9706 if( !isSquare( dm ) ) {
9707 BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
9708 }
9709
9710 using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
9711 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9712}
9714//*************************************************************************************************
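//*************************************************************************************************
// Added usage sketch (not part of the original header): declsym() re-creates the product
// expression with the symmetry flag set (SF = true). It is an explicit promise by the caller
// that the (square) product is symmetric. Names and sizes are illustrative assumptions:
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 4UL, 4UL ), B( 4UL, 4UL ), S;
//    // ... initialize A and B such that A*B is known to be symmetric ...
//    S = blaze::declsym( A * B );  // throws std::invalid_argument for a non-square product
//*************************************************************************************************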
9715
9716
9717//*************************************************************************************************
9740template< typename MT1 // Type of the left-hand side dense matrix
9741 , typename MT2 // Type of the right-hand side dense matrix
9742 , bool SF // Symmetry flag
9743 , bool HF // Hermitian flag
9744 , bool LF // Lower flag
9745 , bool UF > // Upper flag
9746inline decltype(auto) declherm( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9747{
9749
9750 if( !isSquare( dm ) ) {
9751 BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
9752 }
9753
9754 using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9755 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9756}
9758//*************************************************************************************************
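//*************************************************************************************************
// Added usage sketch (not part of the original header): declherm() re-creates the product
// expression with the Hermitian flag set (HF = true), promising that the (square) product is
// Hermitian. A sketch with complex-valued, column-major operands (illustrative names and sizes):
//
//    #include <blaze/Math.h>
//    #include <complex>
//
//    blaze::DynamicMatrix< std::complex<double>, blaze::columnMajor > A( 4UL, 4UL ), B( 4UL, 4UL ), H;
//    // ... initialize A and B such that A*B is known to be Hermitian ...
//    H = blaze::declherm( A * B );
//*************************************************************************************************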
9759
9760
9761//*************************************************************************************************
9784template< typename MT1 // Type of the left-hand side dense matrix
9785 , typename MT2 // Type of the right-hand side dense matrix
9786 , bool SF // Symmetry flag
9787 , bool HF // Hermitian flag
9788 , bool LF // Lower flag
9789 , bool UF > // Upper flag
9790inline decltype(auto) decllow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9791{
9793
9794 if( !isSquare( dm ) ) {
9795 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9796 }
9797
9798 using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9799 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9800}
9802//*************************************************************************************************
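//*************************************************************************************************
// Added usage sketch (not part of the original header): decllow() re-creates the product
// expression with the lower flag set (LF = true), promising that the (square) product is lower
// triangular. Illustrative example assigning to a lower adaptor:
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 4UL, 4UL ), B( 4UL, 4UL );
//    blaze::LowerMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > L;
//    // ... initialize A and B such that A*B is known to be lower triangular ...
//    L = blaze::decllow( A * B );
//*************************************************************************************************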
9803
9804
9805//*************************************************************************************************
9828template< typename MT1 // Type of the left-hand side dense matrix
9829 , typename MT2 // Type of the right-hand side dense matrix
9830 , bool SF // Symmetry flag
9831 , bool HF // Hermitian flag
9832 , bool UF > // Upper flag
9833inline decltype(auto) declunilow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
9834{
9836
9837 if( !isSquare( dm ) ) {
9838 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9839 }
9840
9841 return declunilow( decllow( *dm ) );
9842}
9844//*************************************************************************************************
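//*************************************************************************************************
// Added usage sketch (not part of the original header): this declunilow() overload forwards
// through decllow() (see the return statement above), i.e. the product is first declared lower
// and then declared unilower. Illustrative example, assuming the product is known to be lower
// with an all-ones diagonal:
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 4UL, 4UL ), B( 4UL, 4UL );
//    blaze::UniLowerMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > L;
//    L = blaze::declunilow( A * B );
//*************************************************************************************************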
9845
9846
9847//*************************************************************************************************
9870template< typename MT1 // Type of the left-hand side dense matrix
9871 , typename MT2 // Type of the right-hand side dense matrix
9872 , bool SF // Symmetry flag
9873 , bool HF // Hermitian flag
9874 , bool UF > // Upper flag
9875inline decltype(auto) declstrlow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
9876{
9878
9879 if( !isSquare( dm ) ) {
9880 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9881 }
9882
9883 return declstrlow( decllow( *dm ) );
9884}
9886//*************************************************************************************************
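//*************************************************************************************************
// Added usage sketch (not part of the original header): analogously to declunilow(), this
// declstrlow() overload forwards through decllow() and promises a strictly lower (zero-diagonal)
// product. Illustrative example:
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 4UL, 4UL ), B( 4UL, 4UL );
//    blaze::StrictlyLowerMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > L;
//    L = blaze::declstrlow( A * B );  // valid only if A*B really is strictly lower
//*************************************************************************************************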
9887
9888
9889//*************************************************************************************************
9912template< typename MT1 // Type of the left-hand side dense matrix
9913 , typename MT2 // Type of the right-hand side dense matrix
9914 , bool SF // Symmetry flag
9915 , bool HF // Hermitian flag
9916 , bool LF // Lower flag
9917 , bool UF > // Upper flag
9918inline decltype(auto) declupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9919{
9921
9922 if( !isSquare( dm ) ) {
9923 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9924 }
9925
9926 using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9927 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9928}
9930//*************************************************************************************************
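//*************************************************************************************************
// Added usage sketch (not part of the original header): declupp() re-creates the product
// expression with the upper flag set (UF = true), promising that the (square) product is upper
// triangular. Illustrative example:
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 4UL, 4UL ), B( 4UL, 4UL );
//    blaze::UpperMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > U;
//    // ... initialize A and B such that A*B is known to be upper triangular ...
//    U = blaze::declupp( A * B );
//*************************************************************************************************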
9931
9932
9933//*************************************************************************************************
9956template< typename MT1 // Type of the left-hand side dense matrix
9957 , typename MT2 // Type of the right-hand side dense matrix
9958 , bool SF // Symmetry flag
9959 , bool HF // Hermitian flag
9960 , bool LF > // Lower flag
9961inline decltype(auto) decluniupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
9962{
9964
9965 if( !isSquare( dm ) ) {
9966 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9967 }
9968
9969 return decluniupp( declupp( *dm ) );
9970}
9972//*************************************************************************************************
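//*************************************************************************************************
// Added usage sketch (not part of the original header): this decluniupp() overload forwards
// through declupp() (see the return statement above) and promises an upper product with an
// all-ones diagonal. Illustrative example:
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 4UL, 4UL ), B( 4UL, 4UL );
//    blaze::UniUpperMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > U;
//    U = blaze::decluniupp( A * B );
//*************************************************************************************************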
9973
9974
9975//*************************************************************************************************
9998template< typename MT1 // Type of the left-hand side dense matrix
9999 , typename MT2 // Type of the right-hand side dense matrix
10000 , bool SF // Symmetry flag
10001 , bool HF // Hermitian flag
10002 , bool LF > // Lower flag
10003inline decltype(auto) declstrupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
10004{
10006
10007 if( !isSquare( dm ) ) {
10008 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
10009 }
10010
10011 return declstrupp( declupp( *dm ) );
10012}
10014//*************************************************************************************************
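//*************************************************************************************************
// Added usage sketch (not part of the original header): this declstrupp() overload forwards
// through declupp() and promises a strictly upper (zero-diagonal) product. Illustrative example:
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 4UL, 4UL ), B( 4UL, 4UL );
//    blaze::StrictlyUpperMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > U;
//    U = blaze::declstrupp( A * B );  // valid only if A*B really is strictly upper
//*************************************************************************************************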
10015
10016
10017//*************************************************************************************************
10040template< typename MT1 // Type of the left-hand side dense matrix
10041 , typename MT2 // Type of the right-hand side dense matrix
10042 , bool SF // Symmetry flag
10043 , bool HF // Hermitian flag
10044 , bool LF // Lower flag
10045 , bool UF > // Upper flag
10046inline decltype(auto) decldiag( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
10047{
10049
10050 if( !isSquare( dm ) ) {
10051 BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
10052 }
10053
10054 using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
10055 return ReturnType( dm.leftOperand(), dm.rightOperand() );
10056}
10058//*************************************************************************************************
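//*************************************************************************************************
// Added usage sketch (not part of the original header): decldiag() re-creates the product
// expression with both the lower and the upper flag set, promising that the (square) product is
// diagonal. Illustrative example:
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 4UL, 4UL ), B( 4UL, 4UL );
//    blaze::DiagonalMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > D;
//    // ... initialize A and B such that A*B is known to be diagonal ...
//    D = blaze::decldiag( A * B );
//*************************************************************************************************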
10059
10060
10061
10062
10063//=================================================================================================
10064//
10065// SIZE SPECIALIZATIONS
10066//
10067//=================================================================================================
10068
10069//*************************************************************************************************
10071template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10072struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
10073 : public Size<MT1,0UL>
10074{};
10075
10076template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10077struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
10078 : public Size<MT2,1UL>
10079{};
10081//*************************************************************************************************
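//*************************************************************************************************
// Added sketch (not part of the original header): the Size specializations above propagate the
// compile-time row count from the left-hand side operand and the compile-time column count from
// the right-hand side operand. Illustrative check with column-major StaticMatrix operands:
//
//    #include <blaze/Math.h>
//
//    using A    = blaze::StaticMatrix<double,3UL,4UL,blaze::columnMajor>;
//    using B    = blaze::StaticMatrix<double,4UL,5UL,blaze::columnMajor>;
//    using Expr = blaze::TDMatTDMatMultExpr<A,B,false,false,false,false>;
//
//    static_assert( blaze::Size_v<Expr,0UL> == 3L, "rows come from the left operand"     );
//    static_assert( blaze::Size_v<Expr,1UL> == 5L, "columns come from the right operand" );
//*************************************************************************************************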
10082
10083
10084
10085
10086//=================================================================================================
10087//
10088// ISALIGNED SPECIALIZATIONS
10089//
10090//=================================================================================================
10091
10092//*************************************************************************************************
10094template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10095struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10096 : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
10097{};
10099//*************************************************************************************************
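//*************************************************************************************************
// Added sketch (not part of the original header): the IsAligned specialization above evaluates to
// true exactly when both operands are aligned. Illustrative check with column-major DynamicMatrix
// operands:
//
//    #include <blaze/Math.h>
//
//    using M    = blaze::DynamicMatrix<double,blaze::columnMajor>;
//    using Expr = blaze::TDMatTDMatMultExpr<M,M,false,false,false,false>;
//
//    static_assert( blaze::IsAligned_v<Expr> == ( blaze::IsAligned_v<M> && blaze::IsAligned_v<M> ),
//                   "alignment is propagated from the operands" );
//*************************************************************************************************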
10100
10101} // namespace blaze
10102
10103#endif