Blaze 3.9
DMatDMatMultExpr.h
Go to the documentation of this file.
1//=================================================================================================
33//=================================================================================================
34
35#ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
37
38
39//*************************************************************************************************
40// Includes
41//*************************************************************************************************
42
45#include <blaze/math/Aliases.h>
73#include <blaze/math/SIMD.h>
105#include <blaze/system/BLAS.h>
112#include <blaze/util/Assert.h>
113#include <blaze/util/Complex.h>
115#include <blaze/util/EnableIf.h>
118#include <blaze/util/mpl/If.h>
119#include <blaze/util/Types.h>
128
129
130namespace blaze {
131
132//=================================================================================================
133//
134// CLASS DMATDMATMULTEXPR
135//
136//=================================================================================================
137
138//*************************************************************************************************
145template< typename MT1 // Type of the left-hand side dense matrix
146 , typename MT2 // Type of the right-hand side dense matrix
147 , bool SF // Symmetry flag
148 , bool HF // Hermitian flag
149 , bool LF // Lower flag
150 , bool UF > // Upper flag
152 : public MatMatMultExpr< DenseMatrix< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
153 , private Computation
154{
155 private:
156 //**Type definitions****************************************************************************
163 //**********************************************************************************************
164
165 //**********************************************************************************************
167 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
168 //**********************************************************************************************
169
170 //**********************************************************************************************
172 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
173 //**********************************************************************************************
174
175 //**********************************************************************************************
176 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
177 static constexpr bool HERM = ( HF && !( LF || UF ) );
178 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
179 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
180 //**********************************************************************************************
181
182 //**********************************************************************************************
184
189 template< typename T1, typename T2, typename T3 >
190 static constexpr bool CanExploitSymmetry_v =
191 ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
193 //**********************************************************************************************
194
195 //**********************************************************************************************
197
201 template< typename T1, typename T2, typename T3 >
202 static constexpr bool IsEvaluationRequired_v =
203 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
205 //**********************************************************************************************
206
207 //**********************************************************************************************
209
212 template< typename T1, typename T2, typename T3 >
213 static constexpr bool UseBlasKernel_v =
214 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
215 !SYM && !HERM && !LOW && !UPP &&
216 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
217 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
218 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
219 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
220 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
221 IsBLASCompatible_v< ElementType_t<T1> > &&
222 IsBLASCompatible_v< ElementType_t<T2> > &&
223 IsBLASCompatible_v< ElementType_t<T3> > &&
224 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
225 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
227 //**********************************************************************************************
228
229 //**********************************************************************************************
231
// Helper variable template: true when the vectorized default kernel may be used for the
// type triple. Requires optimized kernels to be enabled, neither T2 nor T3 to be diagonal,
// all three types to be SIMD-enabled, SIMD-combinable element types, and SIMD addition and
// multiplication support for the element types of T2 and T3.
234 template< typename T1, typename T2, typename T3 >
235 static constexpr bool UseVectorizedDefaultKernel_v =
236 ( useOptimizedKernels &&
237 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
238 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
239 IsSIMDCombinable_v< ElementType_t<T1>
// NOTE(review): embedded source line 240 is absent from this listing - presumably the
// ElementType_t<T2> argument of IsSIMDCombinable_v; confirm against the original header.
241 , ElementType_t<T3> > &&
242 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
243 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
245 //**********************************************************************************************
246
247 //**********************************************************************************************
249
252 using ForwardFunctor = If_t< HERM
253 , DeclHerm
254 , If_t< SYM
255 , DeclSym
256 , If_t< LOW
257 , If_t< UPP
258 , DeclDiag
259 , DeclLow >
260 , If_t< UPP
261 , DeclUpp
262 , Noop > > > >;
264 //**********************************************************************************************
265
266 public:
267 //**Type definitions****************************************************************************
270
273
// Result type of the multiplication expression, selected by the HERM/SYM/LOW/UPP flags with
// MultTrait<RT1,RT2> as the general (unstructured) fallback.
// NOTE(review): the embedded numbering skips source lines 276, 278, 281, 282 and 284, so the
// per-flag alternatives of the nested If_t are not present in this listing and the alias is
// incomplete as shown - restore them from the original header.
275 using ResultType = typename If_t< HERM
277 , If_t< SYM
279 , If_t< LOW
280 , If_t< UPP
283 , If_t< UPP
285 , MultTrait<RT1,RT2> > > > >::Type;
286
// Return type of the element access functions: a const ElementType.
291 using ReturnType = const ElementType;
// Type used when this expression is composed into other expressions: a const ResultType.
292 using CompositeType = const ResultType;
293
// Storage type of the left-hand side operand: held by value if MT1 is itself an expression,
// by const reference otherwise.
295 using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
296
// Storage type of the right-hand side operand: held by value if MT2 is itself an expression,
// by const reference otherwise.
298 using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
299
302
305 //**********************************************************************************************
306
307 //**Compilation flags***************************************************************************
309 static constexpr bool simdEnabled =
310 ( !IsDiagonal_v<MT2> &&
311 MT1::simdEnabled && MT2::simdEnabled &&
312 HasSIMDAdd_v<ET1,ET2> &&
313 HasSIMDMult_v<ET1,ET2> );
314
316 static constexpr bool smpAssignable =
317 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
318 //**********************************************************************************************
319
320 //**SIMD properties*****************************************************************************
// SIMD width for ElementType as reported by SIMDTrait; used as the column step of the
// vectorized kernels and in the kernel selection heuristic.
322 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
323 //**********************************************************************************************
324
325 //**Constructor*********************************************************************************
331 inline DMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
332 : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
333 , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
334 {
335 BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
336 }
337 //**********************************************************************************************
338
339 //**Access operator*****************************************************************************
// 2D access to element (i,j) of the product, computed on demand.
// i: row index, must be below lhs_.rows(); j: column index, must be below rhs_.columns().
// Both bounds are only checked by internal assertions.
346 inline ReturnType operator()( size_t i, size_t j ) const {
347 BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
348 BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
349
// Diagonal left operand: only lhs_(i,i) contributes to row i.
350 if( IsDiagonal_v<MT1> ) {
351 return lhs_(i,i) * rhs_(i,j);
352 }
// Diagonal right operand: only rhs_(j,j) contributes to column j.
353 else if( IsDiagonal_v<MT2> ) {
354 return lhs_(i,j) * rhs_(j,j);
355 }
// Triangular operand(s): restrict the summation index to [begin,end), derived from the
// upper/lower (strict or not) structure of row i of lhs_ and column j of rhs_.
356 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
357 const size_t begin( ( IsUpper_v<MT1> )
358 ?( ( IsLower_v<MT2> )
359 ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
360 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
361 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
362 :( ( IsLower_v<MT2> )
363 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
364 :( 0UL ) ) );
365 const size_t end( ( IsLower_v<MT1> )
366 ?( ( IsUpper_v<MT2> )
367 ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
368 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
369 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
370 :( ( IsUpper_v<MT2> )
371 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
372 :( lhs_.columns() ) ) );
373
// Empty summation range: the element is a default-constructed ElementType.
374 if( begin >= end ) return ElementType();
375
376 const size_t n( end - begin );
377
// NOTE(review): the right-hand factor of this product (embedded source line 379) is absent
// from this listing, so the return statement is incomplete as shown - restore it from the
// original header.
378 return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
380 }
// General case: product of row i of lhs_ and column j of rhs_.
381 else {
382 return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
383 }
384 }
385 //**********************************************************************************************
386
387 //**At function*********************************************************************************
395 inline ReturnType at( size_t i, size_t j ) const {
396 if( i >= lhs_.rows() ) {
397 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
398 }
399 if( j >= rhs_.columns() ) {
400 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
401 }
402 return (*this)(i,j);
403 }
404 //**********************************************************************************************
405
406 //**Rows function*******************************************************************************
411 inline size_t rows() const noexcept {
412 return lhs_.rows();
413 }
414 //**********************************************************************************************
415
416 //**Columns function****************************************************************************
421 inline size_t columns() const noexcept {
422 return rhs_.columns();
423 }
424 //**********************************************************************************************
425
426 //**Left operand access*************************************************************************
431 inline LeftOperand leftOperand() const noexcept {
432 return lhs_;
433 }
434 //**********************************************************************************************
435
436 //**Right operand access************************************************************************
441 inline RightOperand rightOperand() const noexcept {
442 return rhs_;
443 }
444 //**********************************************************************************************
445
446 //**********************************************************************************************
452 template< typename T >
453 inline bool canAlias( const T* alias ) const noexcept {
454 return ( lhs_.canAlias( alias ) || rhs_.canAlias( alias ) );
455 }
456 //**********************************************************************************************
457
458 //**********************************************************************************************
464 template< typename T >
465 inline bool isAliased( const T* alias ) const noexcept {
466 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
467 }
468 //**********************************************************************************************
469
470 //**********************************************************************************************
475 inline bool isAligned() const noexcept {
476 return lhs_.isAligned() && rhs_.isAligned();
477 }
478 //**********************************************************************************************
479
480 //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments. As shown, this requires
// that the BLAS path is not taken (BLAS mode or BLAS matrix/matrix multiplication disabled,
// or the result has fewer than DMATDMATMULT_THRESHOLD elements), that the result has at
// least SMP_DMATDMATMULT_THRESHOLD elements, and that neither operand is diagonal.
// NOTE(review): embedded source line 488 is absent from this listing, so one alternative of
// the first parenthesized condition may be missing here - confirm against the original header.
485 inline bool canSMPAssign() const noexcept {
486 return ( !BLAZE_BLAS_MODE ||
487 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
489 ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
490 ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
491 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
492 }
493 //**********************************************************************************************
494
495 private:
496 //**Member variables****************************************************************************
499 //**********************************************************************************************
500
501 //**Assignment to dense matrices****************************************************************
// Assignment of a dense matrix-dense matrix multiplication to a dense matrix (C = A*B).
// lhs: the target dense matrix; rhs: the multiplication expression to be assigned.
// Both operands are evaluated serially into A and B before a kernel is selected.
// NOTE(review): embedded source lines 517 and 519 are absent from this listing (presumably a
// trailing return type and a function trace - confirm), and the types LT/RT used below are
// declared outside the visible lines.
514 template< typename MT // Type of the target dense matrix
515 , bool SO > // Storage order of the target dense matrix
516 friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
518 {
520
521 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
522 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
523
// An empty target needs no work.
524 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
525 return;
526 }
// No inner dimension: there is nothing to sum, so the target is reset.
527 else if( rhs.lhs_.columns() == 0UL ) {
528 reset( *lhs );
529 return;
530 }
531
532 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
533 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
534
535 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
536 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
537 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
538 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
539 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
540 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
541
542 DMatDMatMultExpr::selectAssignKernel( *lhs, A, B );
543 }
545 //**********************************************************************************************
546
547 //**Assignment to dense matrices (kernel selection)*********************************************
558 template< typename MT3 // Type of the left-hand side target matrix
559 , typename MT4 // Type of the left-hand side matrix operand
560 , typename MT5 > // Type of the right-hand side matrix operand
561 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
562 {
563 if( ( IsDiagonal_v<MT5> ) ||
564 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
565 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
566 selectSmallAssignKernel( C, A, B );
567 else
568 selectBlasAssignKernel( C, A, B );
569 }
571 //**********************************************************************************************
572
573 //**Default assignment to dense matrices (general/general)**************************************
// Default (scalar) kernel for C = A*B, enabled when neither operand is diagonal.
// C: target matrix; A: left-hand side operand; B: right-hand side operand.
// For every row i the summation index k is clipped to the triangular structure of A, and
// the column range [jbegin,jend) to the structure of B and to the declared result structure
// (SYM/HERM/LOW/UPP). The first summand is assigned, all further summands are accumulated.
587 template< typename MT3 // Type of the left-hand side target matrix
588 , typename MT4 // Type of the left-hand side matrix operand
589 , typename MT5 > // Type of the right-hand side matrix operand
590 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
591 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
592 {
593 const size_t M( A.rows() );
594 const size_t N( B.columns() );
595 const size_t K( A.columns() );
596
// A structured result is only meaningful for a square target.
597 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
598
599 for( size_t i=0UL; i<M; ++i )
600 {
// Range [kbegin,kend) of row i of A, narrowed for upper/lower (strict or not) A.
601 const size_t kbegin( ( IsUpper_v<MT4> )
602 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
603 :( 0UL ) );
604 const size_t kend( ( IsLower_v<MT4> )
605 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
606 :( K ) );
607 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
608
// Row i of a strictly triangular A may have an empty range: the whole result row is reset.
609 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
610 for( size_t j=0UL; j<N; ++j ) {
611 reset( C(i,j) );
612 }
613 continue;
614 }
615
// First summand (k == kbegin): assigns instead of accumulating and resets elements outside
// [jbegin,jend) where the operand or result structure applies.
616 {
617 const size_t jbegin( ( IsUpper_v<MT5> )
618 ?( ( IsStrictlyUpper_v<MT5> )
619 ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
620 :( UPP ? max(i,kbegin) : kbegin ) )
621 :( UPP ? i : 0UL ) );
622 const size_t jend( ( IsLower_v<MT5> )
623 ?( ( IsStrictlyLower_v<MT5> )
624 ?( LOW ? min(i+1UL,kbegin) : kbegin )
625 :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
626 :( LOW ? i+1UL : N ) );
627
628 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
629 for( size_t j=0UL; j<jbegin; ++j ) {
630 reset( C(i,j) );
631 }
632 }
633 else if( IsStrictlyUpper_v<MT5> ) {
634 reset( C(i,0UL) );
635 }
636 for( size_t j=jbegin; j<jend; ++j ) {
637 C(i,j) = A(i,kbegin) * B(kbegin,j);
638 }
639 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
640 for( size_t j=jend; j<N; ++j ) {
641 reset( C(i,j) );
642 }
643 }
644 else if( IsStrictlyLower_v<MT5> ) {
645 reset( C(i,N-1UL) );
646 }
647 }
648
// Remaining summands are accumulated. For SYM/HERM/UPP only columns j >= i are updated;
// the lower part of a SYM/HERM result is mirrored after the main loop.
649 for( size_t k=kbegin+1UL; k<kend; ++k )
650 {
651 const size_t jbegin( ( IsUpper_v<MT5> )
652 ?( ( IsStrictlyUpper_v<MT5> )
653 ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
654 :( SYM || HERM || UPP ? max( i, k ) : k ) )
655 :( SYM || HERM || UPP ? i : 0UL ) );
656 const size_t jend( ( IsLower_v<MT5> )
657 ?( ( IsStrictlyLower_v<MT5> )
658 ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
659 :( LOW ? min(i+1UL,k) : k ) )
660 :( LOW ? i+1UL : N ) );
661
662 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
663 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
664
665 for( size_t j=jbegin; j<jend; ++j ) {
666 C(i,j) += A(i,k) * B(k,j);
667 }
// For a lower B, column jend receives its first contribution at this k, hence assignment.
// NOTE(review): when LOW clamps jend to i+1 (k > i+1) this appears to write C(i,i+1), an
// element above the diagonal of a result declared lower - confirm this is intended.
668 if( IsLower_v<MT5> ) {
669 C(i,jend) = A(i,k) * B(k,jend);
670 }
671 }
672 }
673
// Symmetric/Hermitian result: fill the strictly lower part from the computed upper part.
674 if( SYM || HERM ) {
675 for( size_t i=1UL; i<M; ++i ) {
676 for( size_t j=0UL; j<i; ++j ) {
677 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
678 }
679 }
680 }
681 }
683 //**********************************************************************************************
684
685 //**Default assignment to dense matrices (general/diagonal)*************************************
699 template< typename MT3 // Type of the left-hand side target matrix
700 , typename MT4 // Type of the left-hand side matrix operand
701 , typename MT5 > // Type of the right-hand side matrix operand
702 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
703 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
704 {
706
707 const size_t M( A.rows() );
708 const size_t N( B.columns() );
709
710 for( size_t i=0UL; i<M; ++i )
711 {
712 const size_t jbegin( ( IsUpper_v<MT4> )
713 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
714 :( 0UL ) );
715 const size_t jend( ( IsLower_v<MT4> )
716 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
717 :( N ) );
718 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
719
720 if( IsUpper_v<MT4> ) {
721 for( size_t j=0UL; j<jbegin; ++j ) {
722 reset( C(i,j) );
723 }
724 }
725 for( size_t j=jbegin; j<jend; ++j ) {
726 C(i,j) = A(i,j) * B(j,j);
727 }
728 if( IsLower_v<MT4> ) {
729 for( size_t j=jend; j<N; ++j ) {
730 reset( C(i,j) );
731 }
732 }
733 }
734 }
736 //**********************************************************************************************
737
738 //**Default assignment to dense matrices (diagonal/general)*************************************
752 template< typename MT3 // Type of the left-hand side target matrix
753 , typename MT4 // Type of the left-hand side matrix operand
754 , typename MT5 > // Type of the right-hand side matrix operand
755 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
756 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
757 {
759
760 const size_t M( A.rows() );
761 const size_t N( B.columns() );
762
763 for( size_t i=0UL; i<M; ++i )
764 {
765 const size_t jbegin( ( IsUpper_v<MT5> )
766 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
767 :( 0UL ) );
768 const size_t jend( ( IsLower_v<MT5> )
769 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
770 :( N ) );
771 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
772
773 if( IsUpper_v<MT5> ) {
774 for( size_t j=0UL; j<jbegin; ++j ) {
775 reset( C(i,j) );
776 }
777 }
778 for( size_t j=jbegin; j<jend; ++j ) {
779 C(i,j) = A(i,i) * B(i,j);
780 }
781 if( IsLower_v<MT5> ) {
782 for( size_t j=jend; j<N; ++j ) {
783 reset( C(i,j) );
784 }
785 }
786 }
787 }
789 //**********************************************************************************************
790
791 //**Default assignment to dense matrices (diagonal/diagonal)************************************
805 template< typename MT3 // Type of the left-hand side target matrix
806 , typename MT4 // Type of the left-hand side matrix operand
807 , typename MT5 > // Type of the right-hand side matrix operand
808 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
809 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
810 {
812
813 reset( C );
814
815 for( size_t i=0UL; i<A.rows(); ++i ) {
816 C(i,i) = A(i,i) * B(i,i);
817 }
818 }
820 //**********************************************************************************************
821
822 //**Default assignment to dense matrices (small matrices)***************************************
835 template< typename MT3 // Type of the left-hand side target matrix
836 , typename MT4 // Type of the left-hand side matrix operand
837 , typename MT5 > // Type of the right-hand side matrix operand
838 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
839 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
840 {
841 selectDefaultAssignKernel( C, A, B );
842 }
844 //**********************************************************************************************
845
846 //**Vectorized default assignment to row-major dense matrices (small matrices)******************
861 template< typename MT3 // Type of the left-hand side target matrix
862 , typename MT4 // Type of the left-hand side matrix operand
863 , typename MT5 > // Type of the right-hand side matrix operand
864 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
865 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
866 {
867 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
868
869 const size_t M( A.rows() );
870 const size_t N( B.columns() );
871 const size_t K( A.columns() );
872
873 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
874
875 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
876 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
877
878 size_t j( 0UL );
879
880 if( IsIntegral_v<ElementType> )
881 {
882 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
883 for( size_t i=0UL; i<M; ++i )
884 {
885 const size_t kbegin( ( IsUpper_v<MT4> )
886 ?( ( IsLower_v<MT5> )
887 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
888 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
889 :( IsLower_v<MT5> ? j : 0UL ) );
890 const size_t kend( ( IsLower_v<MT4> )
891 ?( ( IsUpper_v<MT5> )
892 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
893 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
894 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
895
896 size_t k( kbegin );
897
898 if( k < kend )
899 {
900 SIMDType a1( set( A(i,k) ) );
901 SIMDType xmm1( a1 * B.load(k,j ) );
902 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
903 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
904 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
905 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
906 SIMDType xmm6( a1 * B.load(k,j+SIMDSIZE*5UL) );
907 SIMDType xmm7( a1 * B.load(k,j+SIMDSIZE*6UL) );
908 SIMDType xmm8( a1 * B.load(k,j+SIMDSIZE*7UL) );
909
910 for( ++k; k<kend; ++k ) {
911 a1 = set( A(i,k) );
912 xmm1 += a1 * B.load(k,j );
913 xmm2 += a1 * B.load(k,j+SIMDSIZE );
914 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
915 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
916 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
917 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
918 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
919 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
920 }
921
922 C.store( i, j , xmm1 );
923 C.store( i, j+SIMDSIZE , xmm2 );
924 C.store( i, j+SIMDSIZE*2UL, xmm3 );
925 C.store( i, j+SIMDSIZE*3UL, xmm4 );
926 C.store( i, j+SIMDSIZE*4UL, xmm5 );
927 C.store( i, j+SIMDSIZE*5UL, xmm6 );
928 C.store( i, j+SIMDSIZE*6UL, xmm7 );
929 C.store( i, j+SIMDSIZE*7UL, xmm8 );
930 }
931 else
932 {
933 const SIMDType zero;
934 C.store( i, j , zero );
935 C.store( i, j+SIMDSIZE , zero );
936 C.store( i, j+SIMDSIZE*2UL, zero );
937 C.store( i, j+SIMDSIZE*3UL, zero );
938 C.store( i, j+SIMDSIZE*4UL, zero );
939 C.store( i, j+SIMDSIZE*5UL, zero );
940 C.store( i, j+SIMDSIZE*6UL, zero );
941 C.store( i, j+SIMDSIZE*7UL, zero );
942 }
943 }
944 }
945 }
946
947 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
948 {
949 size_t i( 0UL );
950
951 for( ; (i+2UL) <= M; i+=2UL )
952 {
953 const size_t kbegin( ( IsUpper_v<MT4> )
954 ?( ( IsLower_v<MT5> )
955 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
956 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
957 :( IsLower_v<MT5> ? j : 0UL ) );
958 const size_t kend( ( IsLower_v<MT4> )
959 ?( ( IsUpper_v<MT5> )
960 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
961 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
962 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
963
964 size_t k( kbegin );
965
966 if( k < kend )
967 {
968 SIMDType a1( set( A(i ,k) ) );
969 SIMDType a2( set( A(i+1UL,k) ) );
970 SIMDType b1( B.load(k,j ) );
971 SIMDType b2( B.load(k,j+SIMDSIZE ) );
972 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
973 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
974 SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
975 SIMDType xmm1 ( a1 * b1 );
976 SIMDType xmm2 ( a1 * b2 );
977 SIMDType xmm3 ( a1 * b3 );
978 SIMDType xmm4 ( a1 * b4 );
979 SIMDType xmm5 ( a1 * b5 );
980 SIMDType xmm6 ( a2 * b1 );
981 SIMDType xmm7 ( a2 * b2 );
982 SIMDType xmm8 ( a2 * b3 );
983 SIMDType xmm9 ( a2 * b4 );
984 SIMDType xmm10( a2 * b5 );
985
986 for( ++k; k<kend; ++k ) {
987 a1 = set( A(i ,k) );
988 a2 = set( A(i+1UL,k) );
989 b1 = B.load(k,j );
990 b2 = B.load(k,j+SIMDSIZE );
991 b3 = B.load(k,j+SIMDSIZE*2UL);
992 b4 = B.load(k,j+SIMDSIZE*3UL);
993 b5 = B.load(k,j+SIMDSIZE*4UL);
994 xmm1 += a1 * b1;
995 xmm2 += a1 * b2;
996 xmm3 += a1 * b3;
997 xmm4 += a1 * b4;
998 xmm5 += a1 * b5;
999 xmm6 += a2 * b1;
1000 xmm7 += a2 * b2;
1001 xmm8 += a2 * b3;
1002 xmm9 += a2 * b4;
1003 xmm10 += a2 * b5;
1004 }
1005
1006 C.store( i , j , xmm1 );
1007 C.store( i , j+SIMDSIZE , xmm2 );
1008 C.store( i , j+SIMDSIZE*2UL, xmm3 );
1009 C.store( i , j+SIMDSIZE*3UL, xmm4 );
1010 C.store( i , j+SIMDSIZE*4UL, xmm5 );
1011 C.store( i+1UL, j , xmm6 );
1012 C.store( i+1UL, j+SIMDSIZE , xmm7 );
1013 C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1014 C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1015 C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1016 }
1017 else
1018 {
1019 const SIMDType zero;
1020 C.store( i , j , zero );
1021 C.store( i , j+SIMDSIZE , zero );
1022 C.store( i , j+SIMDSIZE*2UL, zero );
1023 C.store( i , j+SIMDSIZE*3UL, zero );
1024 C.store( i , j+SIMDSIZE*4UL, zero );
1025 C.store( i+1UL, j , zero );
1026 C.store( i+1UL, j+SIMDSIZE , zero );
1027 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
1028 C.store( i+1UL, j+SIMDSIZE*3UL, zero );
1029 C.store( i+1UL, j+SIMDSIZE*4UL, zero );
1030 }
1031 }
1032
1033 if( i < M )
1034 {
1035 const size_t kbegin( ( IsUpper_v<MT4> )
1036 ?( ( IsLower_v<MT5> )
1037 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1038 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1039 :( IsLower_v<MT5> ? j : 0UL ) );
1040 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
1041
1042 size_t k( kbegin );
1043
1044 if( k < kend )
1045 {
1046 SIMDType a1( set( A(i,k) ) );
1047 SIMDType xmm1( a1 * B.load(k,j ) );
1048 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
1049 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
1050 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
1051 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
1052
1053 for( ++k; k<kend; ++k ) {
1054 a1 = set( A(i,k) );
1055 xmm1 += a1 * B.load(k,j );
1056 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1057 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1058 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1059 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1060 }
1061
1062 C.store( i, j , xmm1 );
1063 C.store( i, j+SIMDSIZE , xmm2 );
1064 C.store( i, j+SIMDSIZE*2UL, xmm3 );
1065 C.store( i, j+SIMDSIZE*3UL, xmm4 );
1066 C.store( i, j+SIMDSIZE*4UL, xmm5 );
1067 }
1068 else
1069 {
1070 const SIMDType zero;
1071 C.store( i, j , zero );
1072 C.store( i, j+SIMDSIZE , zero );
1073 C.store( i, j+SIMDSIZE*2UL, zero );
1074 C.store( i, j+SIMDSIZE*3UL, zero );
1075 C.store( i, j+SIMDSIZE*4UL, zero );
1076 }
1077 }
1078 }
1079
1080 for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1081 {
1082 const size_t iend( UPP ? min(j+SIMDSIZE*4UL,M) : M );
1083 size_t i( 0UL );
1084
1085 if( SYM || HERM ) {
1086 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1087 for( ; i<j; ++i ) {
1088 for( size_t jj=j; jj<jjend; ++jj ) {
1089 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1090 }
1091 }
1092 }
1093 else if( LOW ) {
1094 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1095 for( ; i<j; ++i ) {
1096 for( size_t jj=j; jj<jjend; ++jj ) {
1097 reset( C(i,jj) );
1098 }
1099 }
1100 }
1101
1102 for( ; (i+2UL) <= iend; i+=2UL )
1103 {
1104 const size_t kbegin( ( IsUpper_v<MT4> )
1105 ?( ( IsLower_v<MT5> )
1106 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1107 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1108 :( IsLower_v<MT5> ? j : 0UL ) );
1109 const size_t kend( ( IsLower_v<MT4> )
1110 ?( ( IsUpper_v<MT5> )
1111 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1112 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1113 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
1114
1115 size_t k( kbegin );
1116
1117 if( k < kend )
1118 {
1119 SIMDType a1( set( A(i ,k) ) );
1120 SIMDType a2( set( A(i+1UL,k) ) );
1121 SIMDType b1( B.load(k,j ) );
1122 SIMDType b2( B.load(k,j+SIMDSIZE ) );
1123 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1124 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1125 SIMDType xmm1( a1 * b1 );
1126 SIMDType xmm2( a1 * b2 );
1127 SIMDType xmm3( a1 * b3 );
1128 SIMDType xmm4( a1 * b4 );
1129 SIMDType xmm5( a2 * b1 );
1130 SIMDType xmm6( a2 * b2 );
1131 SIMDType xmm7( a2 * b3 );
1132 SIMDType xmm8( a2 * b4 );
1133
1134 for( ++k; k<kend; ++k ) {
1135 a1 = set( A(i ,k) );
1136 a2 = set( A(i+1UL,k) );
1137 b1 = B.load(k,j );
1138 b2 = B.load(k,j+SIMDSIZE );
1139 b3 = B.load(k,j+SIMDSIZE*2UL);
1140 b4 = B.load(k,j+SIMDSIZE*3UL);
1141 xmm1 += a1 * b1;
1142 xmm2 += a1 * b2;
1143 xmm3 += a1 * b3;
1144 xmm4 += a1 * b4;
1145 xmm5 += a2 * b1;
1146 xmm6 += a2 * b2;
1147 xmm7 += a2 * b3;
1148 xmm8 += a2 * b4;
1149 }
1150
1151 C.store( i , j , xmm1 );
1152 C.store( i , j+SIMDSIZE , xmm2 );
1153 C.store( i , j+SIMDSIZE*2UL, xmm3 );
1154 C.store( i , j+SIMDSIZE*3UL, xmm4 );
1155 C.store( i+1UL, j , xmm5 );
1156 C.store( i+1UL, j+SIMDSIZE , xmm6 );
1157 C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1158 C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1159 }
1160 else
1161 {
1162 SIMDType zero;
1163 C.store( i , j , zero );
1164 C.store( i , j+SIMDSIZE , zero );
1165 C.store( i , j+SIMDSIZE*2UL, zero );
1166 C.store( i , j+SIMDSIZE*3UL, zero );
1167 C.store( i+1UL, j , zero );
1168 C.store( i+1UL, j+SIMDSIZE , zero );
1169 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
1170 C.store( i+1UL, j+SIMDSIZE*3UL, zero );
1171 }
1172 }
1173
1174 if( i < iend )
1175 {
1176 const size_t kbegin( ( IsUpper_v<MT4> )
1177 ?( ( IsLower_v<MT5> )
1178 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1179 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1180 :( IsLower_v<MT5> ? j : 0UL ) );
1181 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1182
1183 size_t k( kbegin );
1184
1185 if( k < kend )
1186 {
1187 SIMDType a1( set( A(i,k) ) );
1188 SIMDType xmm1( a1 * B.load(k,j ) );
1189 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
1190 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
1191 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
1192
1193 for( ++k; k<kend; ++k ) {
1194 a1 = set( A(i,k) );
1195 xmm1 += a1 * B.load(k,j );
1196 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1197 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1198 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1199 }
1200
1201 C.store( i, j , xmm1 );
1202 C.store( i, j+SIMDSIZE , xmm2 );
1203 C.store( i, j+SIMDSIZE*2UL, xmm3 );
1204 C.store( i, j+SIMDSIZE*3UL, xmm4 );
1205 }
1206 else
1207 {
1208 const SIMDType zero;
1209 C.store( i, j , zero );
1210 C.store( i, j+SIMDSIZE , zero );
1211 C.store( i, j+SIMDSIZE*2UL, zero );
1212 C.store( i, j+SIMDSIZE*3UL, zero );
1213 }
1214
1215 if( UPP ) ++i;
1216 }
1217
1218 if( UPP ) {
1219 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1220 for( ; i<M; ++i ) {
1221 for( size_t jj=j; jj<jjend; ++jj ) {
1222 reset( C(i,jj) );
1223 }
1224 }
1225 }
1226 }
1227
1228 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1229 {
1230 const size_t iend( UPP ? min(j+SIMDSIZE*3UL,M) : M );
1231 size_t i( 0UL );
1232
1233 if( SYM || HERM ) {
1234 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1235 for( ; i<j; ++i ) {
1236 for( size_t jj=j; jj<jjend; ++jj ) {
1237 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1238 }
1239 }
1240 }
1241 else if( LOW ) {
1242 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1243 for( ; i<j; ++i ) {
1244 for( size_t jj=j; jj<jjend; ++jj ) {
1245 reset( C(i,jj) );
1246 }
1247 }
1248 }
1249
1250 for( ; (i+2UL) <= iend; i+=2UL )
1251 {
1252 const size_t kbegin( ( IsUpper_v<MT4> )
1253 ?( ( IsLower_v<MT5> )
1254 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1255 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1256 :( IsLower_v<MT5> ? j : 0UL ) );
1257 const size_t kend( ( IsLower_v<MT4> )
1258 ?( ( IsUpper_v<MT5> )
1259 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1260 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1261 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
1262
1263 size_t k( kbegin );
1264
1265 if( k < kend )
1266 {
1267 SIMDType a1( set( A(i ,k) ) );
1268 SIMDType a2( set( A(i+1UL,k) ) );
1269 SIMDType b1( B.load(k,j ) );
1270 SIMDType b2( B.load(k,j+SIMDSIZE ) );
1271 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1272 SIMDType xmm1( a1 * b1 );
1273 SIMDType xmm2( a1 * b2 );
1274 SIMDType xmm3( a1 * b3 );
1275 SIMDType xmm4( a2 * b1 );
1276 SIMDType xmm5( a2 * b2 );
1277 SIMDType xmm6( a2 * b3 );
1278
1279 for( ++k; k<kend; ++k ) {
1280 a1 = set( A(i ,k) );
1281 a2 = set( A(i+1UL,k) );
1282 b1 = B.load(k,j );
1283 b2 = B.load(k,j+SIMDSIZE );
1284 b3 = B.load(k,j+SIMDSIZE*2UL);
1285 xmm1 += a1 * b1;
1286 xmm2 += a1 * b2;
1287 xmm3 += a1 * b3;
1288 xmm4 += a2 * b1;
1289 xmm5 += a2 * b2;
1290 xmm6 += a2 * b3;
1291 }
1292
1293 C.store( i , j , xmm1 );
1294 C.store( i , j+SIMDSIZE , xmm2 );
1295 C.store( i , j+SIMDSIZE*2UL, xmm3 );
1296 C.store( i+1UL, j , xmm4 );
1297 C.store( i+1UL, j+SIMDSIZE , xmm5 );
1298 C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1299 }
1300 else
1301 {
1302 const SIMDType zero;
1303 C.store( i , j , zero );
1304 C.store( i , j+SIMDSIZE , zero );
1305 C.store( i , j+SIMDSIZE*2UL, zero );
1306 C.store( i+1UL, j , zero );
1307 C.store( i+1UL, j+SIMDSIZE , zero );
1308 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
1309 }
1310 }
1311
1312 if( i < iend )
1313 {
1314 const size_t kbegin( ( IsUpper_v<MT4> )
1315 ?( ( IsLower_v<MT5> )
1316 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1317 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1318 :( IsLower_v<MT5> ? j : 0UL ) );
1319 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1320
1321 size_t k( kbegin );
1322
1323 if( k < kend )
1324 {
1325 SIMDType a1( set( A(i,k) ) );
1326 SIMDType xmm1( a1 * B.load(k,j ) );
1327 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
1328 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
1329
1330 for( ++k; k<kend; ++k ) {
1331 a1 = set( A(i,k) );
1332 xmm1 += a1 * B.load(k,j );
1333 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1334 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1335 }
1336
1337 C.store( i, j , xmm1 );
1338 C.store( i, j+SIMDSIZE , xmm2 );
1339 C.store( i, j+SIMDSIZE*2UL, xmm3 );
1340 }
1341 else
1342 {
1343 const SIMDType zero;
1344 C.store( i, j , zero );
1345 C.store( i, j+SIMDSIZE , zero );
1346 C.store( i, j+SIMDSIZE*2UL, zero );
1347 }
1348
1349 if( UPP ) ++i;
1350 }
1351
1352 if( UPP ) {
1353 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1354 for( ; i<M; ++i ) {
1355 for( size_t jj=j; jj<jjend; ++jj ) {
1356 reset( C(i,jj) );
1357 }
1358 }
1359 }
1360 }
1361
1362 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1363 {
1364 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
1365 size_t i( 0UL );
1366
1367 if( SYM || HERM ) {
1368 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1369 for( ; i<j; ++i ) {
1370 for( size_t jj=j; jj<jjend; ++jj ) {
1371 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1372 }
1373 }
1374 }
1375 else if( LOW ) {
1376 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1377 for( ; i<j; ++i ) {
1378 for( size_t jj=j; jj<jjend; ++jj ) {
1379 reset( C(i,jj) );
1380 }
1381 }
1382 }
1383
1384 for( ; (i+4UL) <= iend; i+=4UL )
1385 {
1386 const size_t kbegin( ( IsUpper_v<MT4> )
1387 ?( ( IsLower_v<MT5> )
1388 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1389 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1390 :( IsLower_v<MT5> ? j : 0UL ) );
1391 const size_t kend( ( IsLower_v<MT4> )
1392 ?( ( IsUpper_v<MT5> )
1393 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1394 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1395 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1396
1397 size_t k( kbegin );
1398
1399 if( k < kend )
1400 {
1401 SIMDType a1( set( A(i ,k) ) );
1402 SIMDType a2( set( A(i+1UL,k) ) );
1403 SIMDType a3( set( A(i+2UL,k) ) );
1404 SIMDType a4( set( A(i+3UL,k) ) );
1405 SIMDType b1( B.load(k,j ) );
1406 SIMDType b2( B.load(k,j+SIMDSIZE) );
1407 SIMDType xmm1( a1 * b1 );
1408 SIMDType xmm2( a1 * b2 );
1409 SIMDType xmm3( a2 * b1 );
1410 SIMDType xmm4( a2 * b2 );
1411 SIMDType xmm5( a3 * b1 );
1412 SIMDType xmm6( a3 * b2 );
1413 SIMDType xmm7( a4 * b1 );
1414 SIMDType xmm8( a4 * b2 );
1415
1416 for( ++k; k<kend; ++k ) {
1417 a1 = set( A(i ,k) );
1418 a2 = set( A(i+1UL,k) );
1419 a3 = set( A(i+2UL,k) );
1420 a4 = set( A(i+3UL,k) );
1421 b1 = B.load(k,j );
1422 b2 = B.load(k,j+SIMDSIZE);
1423 xmm1 += a1 * b1;
1424 xmm2 += a1 * b2;
1425 xmm3 += a2 * b1;
1426 xmm4 += a2 * b2;
1427 xmm5 += a3 * b1;
1428 xmm6 += a3 * b2;
1429 xmm7 += a4 * b1;
1430 xmm8 += a4 * b2;
1431 }
1432
1433 C.store( i , j , xmm1 );
1434 C.store( i , j+SIMDSIZE, xmm2 );
1435 C.store( i+1UL, j , xmm3 );
1436 C.store( i+1UL, j+SIMDSIZE, xmm4 );
1437 C.store( i+2UL, j , xmm5 );
1438 C.store( i+2UL, j+SIMDSIZE, xmm6 );
1439 C.store( i+3UL, j , xmm7 );
1440 C.store( i+3UL, j+SIMDSIZE, xmm8 );
1441 }
1442 else
1443 {
1444 const SIMDType zero;
1445 C.store( i , j , zero );
1446 C.store( i , j+SIMDSIZE, zero );
1447 C.store( i+1UL, j , zero );
1448 C.store( i+1UL, j+SIMDSIZE, zero );
1449 C.store( i+2UL, j , zero );
1450 C.store( i+2UL, j+SIMDSIZE, zero );
1451 C.store( i+3UL, j , zero );
1452 C.store( i+3UL, j+SIMDSIZE, zero );
1453 }
1454 }
1455
1456 for( ; (i+3UL) <= iend; i+=3UL )
1457 {
1458 const size_t kbegin( ( IsUpper_v<MT4> )
1459 ?( ( IsLower_v<MT5> )
1460 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1461 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1462 :( IsLower_v<MT5> ? j : 0UL ) );
1463 const size_t kend( ( IsLower_v<MT4> )
1464 ?( ( IsUpper_v<MT5> )
1465 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1466 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1467 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1468
1469 size_t k( kbegin );
1470
1471 if( k < kend )
1472 {
1473 SIMDType a1( set( A(i ,k) ) );
1474 SIMDType a2( set( A(i+1UL,k) ) );
1475 SIMDType a3( set( A(i+2UL,k) ) );
1476 SIMDType b1( B.load(k,j ) );
1477 SIMDType b2( B.load(k,j+SIMDSIZE) );
1478 SIMDType xmm1( a1 * b1 );
1479 SIMDType xmm2( a1 * b2 );
1480 SIMDType xmm3( a2 * b1 );
1481 SIMDType xmm4( a2 * b2 );
1482 SIMDType xmm5( a3 * b1 );
1483 SIMDType xmm6( a3 * b2 );
1484
1485 for( ++k; k<kend; ++k ) {
1486 a1 = set( A(i ,k) );
1487 a2 = set( A(i+1UL,k) );
1488 a3 = set( A(i+2UL,k) );
1489 b1 = B.load(k,j );
1490 b2 = B.load(k,j+SIMDSIZE);
1491 xmm1 += a1 * b1;
1492 xmm2 += a1 * b2;
1493 xmm3 += a2 * b1;
1494 xmm4 += a2 * b2;
1495 xmm5 += a3 * b1;
1496 xmm6 += a3 * b2;
1497 }
1498
1499 C.store( i , j , xmm1 );
1500 C.store( i , j+SIMDSIZE, xmm2 );
1501 C.store( i+1UL, j , xmm3 );
1502 C.store( i+1UL, j+SIMDSIZE, xmm4 );
1503 C.store( i+2UL, j , xmm5 );
1504 C.store( i+2UL, j+SIMDSIZE, xmm6 );
1505 }
1506 else
1507 {
1508 const SIMDType zero;
1509 C.store( i , j , zero );
1510 C.store( i , j+SIMDSIZE, zero );
1511 C.store( i+1UL, j , zero );
1512 C.store( i+1UL, j+SIMDSIZE, zero );
1513 C.store( i+2UL, j , zero );
1514 C.store( i+2UL, j+SIMDSIZE, zero );
1515 }
1516 }
1517
1518 for( ; (i+2UL) <= iend; i+=2UL )
1519 {
1520 const size_t kbegin( ( IsUpper_v<MT4> )
1521 ?( ( IsLower_v<MT5> )
1522 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1523 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1524 :( IsLower_v<MT5> ? j : 0UL ) );
1525 const size_t kend( ( IsLower_v<MT4> )
1526 ?( ( IsUpper_v<MT5> )
1527 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1528 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1529 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1530
1531 size_t k( kbegin );
1532
1533 if( k < kend )
1534 {
1535 SIMDType a1( set( A(i ,k) ) );
1536 SIMDType a2( set( A(i+1UL,k) ) );
1537 SIMDType b1( B.load(k,j ) );
1538 SIMDType b2( B.load(k,j+SIMDSIZE) );
1539 SIMDType xmm1( a1 * b1 );
1540 SIMDType xmm2( a1 * b2 );
1541 SIMDType xmm3( a2 * b1 );
1542 SIMDType xmm4( a2 * b2 );
1543
1544 for( ++k; k<kend; ++k ) {
1545 a1 = set( A(i ,k) );
1546 a2 = set( A(i+1UL,k) );
1547 b1 = B.load(k,j );
1548 b2 = B.load(k,j+SIMDSIZE);
1549 xmm1 += a1 * b1;
1550 xmm2 += a1 * b2;
1551 xmm3 += a2 * b1;
1552 xmm4 += a2 * b2;
1553 }
1554
1555 C.store( i , j , xmm1 );
1556 C.store( i , j+SIMDSIZE, xmm2 );
1557 C.store( i+1UL, j , xmm3 );
1558 C.store( i+1UL, j+SIMDSIZE, xmm4 );
1559 }
1560 else
1561 {
1562 const SIMDType zero;
1563 C.store( i , j , zero );
1564 C.store( i , j+SIMDSIZE, zero );
1565 C.store( i+1UL, j , zero );
1566 C.store( i+1UL, j+SIMDSIZE, zero );
1567 }
1568 }
1569
1570 if( i < iend )
1571 {
1572 const size_t kbegin( ( IsUpper_v<MT4> )
1573 ?( ( IsLower_v<MT5> )
1574 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1575 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1576 :( IsLower_v<MT5> ? j : 0UL ) );
1577 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1578
1579 size_t k( kbegin );
1580
1581 if( k < kend )
1582 {
1583 SIMDType a1( set( A(i,k) ) );
1584 SIMDType xmm1( a1 * B.load(k,j ) );
1585 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE) );
1586
1587 for( ++k; k<kend; ++k ) {
1588 a1 = set( A(i,k) );
1589 xmm1 += a1 * B.load(k,j );
1590 xmm2 += a1 * B.load(k,j+SIMDSIZE);
1591 }
1592
1593 C.store( i, j , xmm1 );
1594 C.store( i, j+SIMDSIZE, xmm2 );
1595 }
1596 else
1597 {
1598 const SIMDType zero;
1599 C.store( i, j , zero );
1600 C.store( i, j+SIMDSIZE, zero );
1601 }
1602
1603 if( UPP ) ++i;
1604 }
1605
1606 if( UPP ) {
1607 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1608 for( ; i<M; ++i ) {
1609 for( size_t jj=j; jj<jjend; ++jj ) {
1610 reset( C(i,jj) );
1611 }
1612 }
1613 }
1614 }
1615
1616 for( ; j<jpos; j+=SIMDSIZE )
1617 {
1618 const size_t iend( UPP ? min(j+SIMDSIZE,M) : M );
1619 size_t i( 0UL );
1620
1621 if( SYM || HERM ) {
1622 const size_t jjend( min(j+SIMDSIZE,N) );
1623 for( ; i<j; ++i ) {
1624 for( size_t jj=j; jj<jjend; ++jj ) {
1625 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1626 }
1627 }
1628 }
1629 else if( LOW ) {
1630 const size_t jjend( min(j+SIMDSIZE,N) );
1631 for( ; i<j; ++i ) {
1632 for( size_t jj=j; jj<jjend; ++jj ) {
1633 reset( C(i,jj) );
1634 }
1635 }
1636 }
1637
1638 for( ; (i+4UL) <= iend; i+=4UL )
1639 {
1640 const size_t kbegin( ( IsUpper_v<MT4> )
1641 ?( ( IsLower_v<MT5> )
1642 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1643 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1644 :( IsLower_v<MT5> ? j : 0UL ) );
1645 const size_t kend( ( IsLower_v<MT4> )
1646 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1647 :( K ) );
1648
1649 size_t k( kbegin );
1650
1651 if( k < kend )
1652 {
1653 SIMDType b1( B.load(k,j) );
1654 SIMDType xmm1( set( A(i ,k) ) * b1 );
1655 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
1656 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
1657 SIMDType xmm4( set( A(i+3UL,k) ) * b1 );
1658
1659 for( ++k; k<kend; ++k ) {
1660 b1 = B.load(k,j);
1661 xmm1 += set( A(i ,k) ) * b1;
1662 xmm2 += set( A(i+1UL,k) ) * b1;
1663 xmm3 += set( A(i+2UL,k) ) * b1;
1664 xmm4 += set( A(i+3UL,k) ) * b1;
1665 }
1666
1667 C.store( i , j, xmm1 );
1668 C.store( i+1UL, j, xmm2 );
1669 C.store( i+2UL, j, xmm3 );
1670 C.store( i+3UL, j, xmm4 );
1671 }
1672 else
1673 {
1674 const SIMDType zero;
1675 C.store( i , j, zero );
1676 C.store( i+1UL, j, zero );
1677 C.store( i+2UL, j, zero );
1678 C.store( i+3UL, j, zero );
1679 }
1680 }
1681
1682 for( ; (i+3UL) <= iend; i+=3UL )
1683 {
1684 const size_t kbegin( ( IsUpper_v<MT4> )
1685 ?( ( IsLower_v<MT5> )
1686 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1687 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1688 :( IsLower_v<MT5> ? j : 0UL ) );
1689 const size_t kend( ( IsLower_v<MT4> )
1690 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1691 :( K ) );
1692
1693 size_t k( kbegin );
1694
1695 if( k < kend )
1696 {
1697 SIMDType b1( B.load(k,j) );
1698 SIMDType xmm1( set( A(i ,k) ) * b1 );
1699 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
1700 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
1701
1702 for( ++k; k<kend; ++k ) {
1703 b1 = B.load(k,j);
1704 xmm1 += set( A(i ,k) ) * b1;
1705 xmm2 += set( A(i+1UL,k) ) * b1;
1706 xmm3 += set( A(i+2UL,k) ) * b1;
1707 }
1708
1709 C.store( i , j, xmm1 );
1710 C.store( i+1UL, j, xmm2 );
1711 C.store( i+2UL, j, xmm3 );
1712 }
1713 else
1714 {
1715 C.store( i , j, SIMDType() );
1716 C.store( i+1UL, j, SIMDType() );
1717 C.store( i+2UL, j, SIMDType() );
1718 }
1719 }
1720
1721 for( ; (i+2UL) <= iend; i+=2UL )
1722 {
1723 const size_t kbegin( ( IsUpper_v<MT4> )
1724 ?( ( IsLower_v<MT5> )
1725 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1726 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1727 :( IsLower_v<MT5> ? j : 0UL ) );
1728 const size_t kend( ( IsLower_v<MT4> )
1729 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1730 :( K ) );
1731
1732 size_t k( kbegin );
1733
1734 if( k < kend )
1735 {
1736 SIMDType b1( B.load(k,j) );
1737 SIMDType xmm1( set( A(i ,k) ) * b1 );
1738 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
1739
1740 for( ++k; k<kend; ++k ) {
1741 b1 = B.load(k,j);
1742 xmm1 += set( A(i ,k) ) * b1;
1743 xmm2 += set( A(i+1UL,k) ) * b1;
1744 }
1745
1746 C.store( i , j, xmm1 );
1747 C.store( i+1UL, j, xmm2 );
1748 }
1749 else
1750 {
1751 const SIMDType zero;
1752 C.store( i , j, zero );
1753 C.store( i+1UL, j, zero );
1754 }
1755 }
1756
1757 if( i < iend )
1758 {
1759 const size_t kbegin( ( IsUpper_v<MT4> )
1760 ?( ( IsLower_v<MT5> )
1761 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1762 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1763 :( IsLower_v<MT5> ? j : 0UL ) );
1764
1765 size_t k( kbegin );
1766
1767 if( k < K )
1768 {
1769 SIMDType xmm1( set( A(i,k) ) * B.load(k,j) );
1770
1771 for( ++k; k<K; ++k ) {
1772 xmm1 += set( A(i,k) ) * B.load(k,j);
1773 }
1774
1775 C.store( i, j, xmm1 );
1776 }
1777 else
1778 {
1779 const SIMDType zero;
1780 C.store( i, j, zero );
1781 }
1782
1783 if( UPP ) ++i;
1784 }
1785
1786 if( UPP ) {
1787 const size_t jjend( min(j+SIMDSIZE,N) );
1788 for( ; i<M; ++i ) {
1789 for( size_t jj=j; jj<jjend; ++jj ) {
1790 reset( C(i,jj) );
1791 }
1792 }
1793 }
1794 }
1795
1796 for( ; remainder && j<N; ++j )
1797 {
1798 size_t i( 0UL );
1799
1800 if( SYM || HERM ) {
1801 for( ; i<j; ++i ) {
1802 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1803 }
1804 }
1805 else if( LOW ) {
1806 for( ; i<j; ++i ) {
1807 reset( C(i,j) );
1808 }
1809 }
1810
1811 for( ; (i+2UL) <= M; i+=2UL )
1812 {
1813 const size_t kbegin( ( IsUpper_v<MT4> )
1814 ?( ( IsLower_v<MT5> )
1815 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1816 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1817 :( IsLower_v<MT5> ? j : 0UL ) );
1818 const size_t kend( ( IsLower_v<MT4> )
1819 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1820 :( K ) );
1821
1822 size_t k( kbegin );
1823
1824 if( k < kend )
1825 {
1826 ElementType value1( A(i ,k) * B(k,j) );
1827 ElementType value2( A(i+1UL,k) * B(k,j) );
1828
1829 for( ++k; k<kend; ++k ) {
1830 value1 += A(i ,k) * B(k,j);
1831 value2 += A(i+1UL,k) * B(k,j);
1832 }
1833
1834 C(i ,j) = value1;
1835 C(i+1UL,j) = value2;
1836 }
1837 else
1838 {
1839 reset( C(i ,j) );
1840 reset( C(i+1UL,j) );
1841 }
1842 }
1843
1844 if( i < M )
1845 {
1846 const size_t kbegin( ( IsUpper_v<MT4> )
1847 ?( ( IsLower_v<MT5> )
1848 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1849 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1850 :( IsLower_v<MT5> ? j : 0UL ) );
1851
1852 size_t k( kbegin );
1853
1854 if( k < K )
1855 {
1856 ElementType value( A(i,k) * B(k,j) );
1857
1858 for( ++k; k<K; ++k ) {
1859 value += A(i,k) * B(k,j);
1860 }
1861
1862 C(i,j) = value;
1863 }
1864 else
1865 {
1866 reset( C(i,j) );
1867 }
1868 }
1869 }
1870 }
1872 //**********************************************************************************************
1873
1874 //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1889 template< typename MT3 // Type of the left-hand side target matrix
1890 , typename MT4 // Type of the left-hand side matrix operand
1891 , typename MT5 > // Type of the right-hand side matrix operand
1892 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1893 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1894 {
1899
1900 const ForwardFunctor fwd;
1901
1902 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
1903 const OppositeType_t<MT4> tmp( serial( A ) );
1904 assign( C, fwd( tmp * B ) );
1905 }
1906 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
1907 const OppositeType_t<MT5> tmp( serial( B ) );
1908 assign( C, fwd( A * tmp ) );
1909 }
1910 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1911 const OppositeType_t<MT4> tmp( serial( A ) );
1912 assign( C, fwd( tmp * B ) );
1913 }
1914 else {
1915 const OppositeType_t<MT5> tmp( serial( B ) );
1916 assign( C, fwd( A * tmp ) );
1917 }
1918 }
1920 //**********************************************************************************************
1921
1922 //**Default assignment to dense matrices (large matrices)***************************************
1935 template< typename MT3 // Type of the left-hand side target matrix
1936 , typename MT4 // Type of the left-hand side matrix operand
1937 , typename MT5 > // Type of the right-hand side matrix operand
1938 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1939 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1940 {
1941 selectDefaultAssignKernel( C, A, B );
1942 }
1944 //**********************************************************************************************
1945
1946 //**Vectorized default assignment to dense matrices (large matrices)****************************
1960 template< typename MT3 // Type of the left-hand side target matrix
1961 , typename MT4 // Type of the left-hand side matrix operand
1962 , typename MT5 > // Type of the right-hand side matrix operand
1963 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1964 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1965 {
1966 if( SYM )
1967 smmm( C, A, B, ElementType(1) );
1968 else if( HERM )
1969 hmmm( C, A, B, ElementType(1) );
1970 else if( LOW )
1971 lmmm( C, A, B, ElementType(1), ElementType(0) );
1972 else if( UPP )
1973 ummm( C, A, B, ElementType(1), ElementType(0) );
1974 else
1975 mmm( C, A, B, ElementType(1), ElementType(0) );
1976 }
1978 //**********************************************************************************************
1979
1980 //**BLAS-based assignment to dense matrices (default)*******************************************
1993 template< typename MT3 // Type of the left-hand side target matrix
1994 , typename MT4 // Type of the left-hand side matrix operand
1995 , typename MT5 > // Type of the right-hand side matrix operand
1996 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1997 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1998 {
1999 selectLargeAssignKernel( C, A, B );
2000 }
2002 //**********************************************************************************************
2003
2004 //**BLAS-based assignment to dense matrices*****************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
/*!\brief BLAS-based assignment of a dense matrix-dense matrix multiplication.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// If the left-hand side operand type is triangular, \a B is copied into \a C and \c trmm is
// applied from the left with \a A. Otherwise, if the right-hand side operand type is triangular,
// \a A is copied into \a C and \c trmm is applied from the right with \a B. In all remaining
// cases \c gemm is called with the scalars 1 and 0. Only compiled if BLAS mode and the BLAS
// matrix/matrix multiplication are both enabled.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
{
   using Scalar = ElementType_t<MT3>;

   // Triangular left operand: C = B, followed by an in-place triangular multiplication from the left
   if( IsTriangular_v<MT4> ) {
      const auto uplo( IsLower_v<MT4> ? CblasLower : CblasUpper );
      assign( C, B );
      trmm( C, A, CblasLeft, uplo, Scalar(1) );
      return;
   }

   // Triangular right operand: C = A, followed by an in-place triangular multiplication from the right
   if( IsTriangular_v<MT5> ) {
      const auto uplo( IsLower_v<MT5> ? CblasLower : CblasUpper );
      assign( C, A );
      trmm( C, B, CblasRight, uplo, Scalar(1) );
      return;
   }

   // General case
   gemm( C, A, B, Scalar(1), Scalar(0) );
}
#endif
2040 //**********************************************************************************************
2041
2042 //**Assignment to sparse matrices***************************************************************
2055 template< typename MT // Type of the target sparse matrix
2056 , bool SO > // Storage order of the target sparse matrix
2057 friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
2058 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2059 {
2061
2062 using TmpType = If_t< SO, OppositeType, ResultType >;
2063
2070
2071 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
2072 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
2073
2074 const ForwardFunctor fwd;
2075
2076 const TmpType tmp( serial( rhs ) );
2077 assign( *lhs, fwd( tmp ) );
2078 }
2080 //**********************************************************************************************
2081
2082 //**Restructuring assignment to column-major matrices*******************************************
2097 template< typename MT > // Type of the target matrix
2098 friend inline auto assign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
2099 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2100 {
2102
2104
2105 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
2106 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
2107
2108 const ForwardFunctor fwd;
2109
2110 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
2111 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
2112
2113 assign( *lhs, fwd( A * B ) );
2114 }
2116 //**********************************************************************************************
2117
2118 //**Addition assignment to dense matrices*******************************************************
2131 template< typename MT // Type of the target dense matrix
2132 , bool SO > // Storage order of the target dense matrix
2133 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
2134 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2135 {
2137
2138 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
2139 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
2140
2141 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2142 return;
2143 }
2144
2145 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2146 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2147
2148 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2149 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2150 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2151 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2152 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
2153 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
2154
2155 DMatDMatMultExpr::selectAddAssignKernel( *lhs, A, B );
2156 }
2158 //**********************************************************************************************
2159
2160 //**Addition assignment to dense matrices (kernel selection)************************************
2171 template< typename MT3 // Type of the left-hand side target matrix
2172 , typename MT4 // Type of the left-hand side matrix operand
2173 , typename MT5 > // Type of the right-hand side matrix operand
2174 static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2175 {
2176 if( ( IsDiagonal_v<MT5> ) ||
2177 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
2178 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
2179 selectSmallAddAssignKernel( C, A, B );
2180 else
2181 selectBlasAddAssignKernel( C, A, B );
2182 }
2184 //**********************************************************************************************
2185
2186 //**Default addition assignment to dense matrices (general/general)*****************************
2200 template< typename MT3 // Type of the left-hand side target matrix
2201 , typename MT4 // Type of the left-hand side matrix operand
2202 , typename MT5 > // Type of the right-hand side matrix operand
2203 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2204 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2205 {
2206 const size_t M( A.rows() );
2207 const size_t N( B.columns() );
2208 const size_t K( A.columns() );
2209
2210 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2211
2212 for( size_t i=0UL; i<M; ++i )
2213 {
2214 const size_t kbegin( ( IsUpper_v<MT4> )
2215 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2216 :( 0UL ) );
2217 const size_t kend( ( IsLower_v<MT4> )
2218 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2219 :( K ) );
2220 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2221
2222 for( size_t k=kbegin; k<kend; ++k )
2223 {
2224 const size_t jbegin( ( IsUpper_v<MT5> )
2225 ?( ( IsStrictlyUpper_v<MT5> )
2226 ?( UPP ? max(i,k+1UL) : k+1UL )
2227 :( UPP ? max(i,k) : k ) )
2228 :( UPP ? i : 0UL ) );
2229 const size_t jend( ( IsLower_v<MT5> )
2230 ?( ( IsStrictlyLower_v<MT5> )
2231 ?( LOW ? min(i+1UL,k) : k )
2232 :( LOW ? min(i,k)+1UL : k+1UL ) )
2233 :( LOW ? i+1UL : N ) );
2234
2235 if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
2236 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2237
2238 const size_t jnum( jend - jbegin );
2239 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
2240 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
2241
2242 for( size_t j=jbegin; j<jpos; j+=2UL ) {
2243 C(i,j ) += A(i,k) * B(k,j );
2244 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2245 }
2246 if( jpos < jend ) {
2247 C(i,jpos) += A(i,k) * B(k,jpos);
2248 }
2249 }
2250 }
2251 }
2253 //**********************************************************************************************
2254
2255 //**Default addition assignment to dense matrices (general/diagonal)****************************
2269 template< typename MT3 // Type of the left-hand side target matrix
2270 , typename MT4 // Type of the left-hand side matrix operand
2271 , typename MT5 > // Type of the right-hand side matrix operand
2272 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2273 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2274 {
2276
2277 const size_t M( A.rows() );
2278 const size_t N( B.columns() );
2279
2280 for( size_t i=0UL; i<M; ++i )
2281 {
2282 const size_t jbegin( ( IsUpper_v<MT4> )
2283 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2284 :( 0UL ) );
2285 const size_t jend( ( IsLower_v<MT4> )
2286 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2287 :( N ) );
2288 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2289
2290 const size_t jnum( jend - jbegin );
2291 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
2292 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
2293
2294 for( size_t j=jbegin; j<jpos; j+=2UL ) {
2295 C(i,j ) += A(i,j ) * B(j ,j );
2296 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
2297 }
2298 if( jpos < jend ) {
2299 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
2300 }
2301 }
2302 }
2304 //**********************************************************************************************
2305
2306 //**Default addition assignment to dense matrices (diagonal/general)****************************
2320 template< typename MT3 // Type of the left-hand side target matrix
2321 , typename MT4 // Type of the left-hand side matrix operand
2322 , typename MT5 > // Type of the right-hand side matrix operand
2323 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2324 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2325 {
2327
2328 const size_t M( A.rows() );
2329 const size_t N( B.columns() );
2330
2331 for( size_t i=0UL; i<M; ++i )
2332 {
2333 const size_t jbegin( ( IsUpper_v<MT5> )
2334 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
2335 :( 0UL ) );
2336 const size_t jend( ( IsLower_v<MT5> )
2337 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
2338 :( N ) );
2339 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2340
2341 const size_t jnum( jend - jbegin );
2342 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
2343 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
2344
2345 for( size_t j=jbegin; j<jpos; j+=2UL ) {
2346 C(i,j ) += A(i,i) * B(i,j );
2347 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
2348 }
2349 if( jpos < jend ) {
2350 C(i,jpos) += A(i,i) * B(i,jpos);
2351 }
2352 }
2353 }
2355 //**********************************************************************************************
2356
2357 //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2371 template< typename MT3 // Type of the left-hand side target matrix
2372 , typename MT4 // Type of the left-hand side matrix operand
2373 , typename MT5 > // Type of the right-hand side matrix operand
2374 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2375 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2376 {
2378
2379 for( size_t i=0UL; i<A.rows(); ++i ) {
2380 C(i,i) += A(i,i) * B(i,i);
2381 }
2382 }
2384 //**********************************************************************************************
2385
2386 //**Default addition assignment to dense matrices (small matrices)******************************
2400 template< typename MT3 // Type of the left-hand side target matrix
2401 , typename MT4 // Type of the left-hand side matrix operand
2402 , typename MT5 > // Type of the right-hand side matrix operand
2403 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2404 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2405 {
2406 selectDefaultAddAssignKernel( C, A, B );
2407 }
2409 //**********************************************************************************************
2410
2411 //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix kernel for the addition assignment C += A * B with a row-major target.
//
// The columns of C are processed in blocks of 8 (integral element types only), 5, 4, 3, 2 and 1
// SIMD vectors, in that order; each block loop picks up at the column index j where the previous
// one stopped. Within a column block several rows of C are handled at once. For every tile the
// current values of C are loaded, the products A(i,k) * B(k,j..) are accumulated for k in
// [kbegin,kend), and the sums are stored back to C.
//
// C: target matrix (read and written via load/store and operator()).
// A: left-hand side operand (read element-wise via operator()).
// B: right-hand side operand (read via load and operator()).
//
// NOTE(review): LOW and UPP are flags declared outside this excerpt; they appear to restrict the
// update to the lower/upper part of C -- confirm against the class definition.
2426 template< typename MT3 // Type of the left-hand side target matrix
2427 , typename MT4 // Type of the left-hand side matrix operand
2428 , typename MT5 > // Type of the right-hand side matrix operand
2429 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2430 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2431 {
// If C or B is not padded, only the columns below jpos are handled with SIMD operations and the
// columns in [jpos,N) are handled by the scalar loop at the end of this function.
2432 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
2433
2434 const size_t M( A.rows() );
2435 const size_t N( B.columns() );
2436 const size_t K( A.columns() );
2437
2438 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2439
2440 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
2441 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
2442
// Column index shared by all of the following block loops.
2443 size_t j( 0UL );
2444
// Column blocks of 8 SIMD vectors, one row at a time. Only used for integral element types and
// only when neither LOW nor UPP is set.
2445 if( IsIntegral_v<ElementType> )
2446 {
2447 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2448 for( size_t i=0UL; i<M; ++i )
2449 {
// kbegin/kend narrow the k range based on the compile-time upper/lower properties of the operand
// types MT4 (A) and MT5 (B); presumably this skips elements that are structurally zero.
2450 const size_t kbegin( ( IsUpper_v<MT4> )
2451 ?( ( IsLower_v<MT5> )
2452 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2453 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2454 :( IsLower_v<MT5> ? j : 0UL ) );
2455 const size_t kend( ( IsLower_v<MT4> )
2456 ?( ( IsUpper_v<MT5> )
2457 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2458 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2459 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
2460
2461 SIMDType xmm1( C.load(i,j ) );
2462 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2463 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2464 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2465 SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
2466 SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
2467 SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
2468 SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
2469
2470 for( size_t k=kbegin; k<kend; ++k ) {
2471 const SIMDType a1( set( A(i,k) ) );
2472 xmm1 += a1 * B.load(k,j );
2473 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2474 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2475 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2476 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2477 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
2478 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
2479 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
2480 }
2481
2482 C.store( i, j , xmm1 );
2483 C.store( i, j+SIMDSIZE , xmm2 );
2484 C.store( i, j+SIMDSIZE*2UL, xmm3 );
2485 C.store( i, j+SIMDSIZE*3UL, xmm4 );
2486 C.store( i, j+SIMDSIZE*4UL, xmm5 );
2487 C.store( i, j+SIMDSIZE*5UL, xmm6 );
2488 C.store( i, j+SIMDSIZE*6UL, xmm7 );
2489 C.store( i, j+SIMDSIZE*7UL, xmm8 );
2490 }
2491 }
2492 }
2493
// Column blocks of 5 SIMD vectors: two rows at a time, followed by a single leftover row.
// Only used when neither LOW nor UPP is set.
2494 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
2495 {
2496 size_t i( 0UL );
2497
2498 for( ; (i+2UL) <= M; i+=2UL )
2499 {
2500 const size_t kbegin( ( IsUpper_v<MT4> )
2501 ?( ( IsLower_v<MT5> )
2502 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2503 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2504 :( IsLower_v<MT5> ? j : 0UL ) );
2505 const size_t kend( ( IsLower_v<MT4> )
2506 ?( ( IsUpper_v<MT5> )
2507 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
2508 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2509 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
2510
2511 SIMDType xmm1 ( C.load(i ,j ) );
2512 SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
2513 SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
2514 SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
2515 SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
2516 SIMDType xmm6 ( C.load(i+1UL,j ) );
2517 SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
2518 SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
2519 SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
2520 SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
2521
2522 for( size_t k=kbegin; k<kend; ++k ) {
2523 const SIMDType a1( set( A(i ,k) ) );
2524 const SIMDType a2( set( A(i+1UL,k) ) );
2525 const SIMDType b1( B.load(k,j ) );
2526 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2527 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2528 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2529 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2530 xmm1 += a1 * b1;
2531 xmm2 += a1 * b2;
2532 xmm3 += a1 * b3;
2533 xmm4 += a1 * b4;
2534 xmm5 += a1 * b5;
2535 xmm6 += a2 * b1;
2536 xmm7 += a2 * b2;
2537 xmm8 += a2 * b3;
2538 xmm9 += a2 * b4;
2539 xmm10 += a2 * b5;
2540 }
2541
2542 C.store( i , j , xmm1 );
2543 C.store( i , j+SIMDSIZE , xmm2 );
2544 C.store( i , j+SIMDSIZE*2UL, xmm3 );
2545 C.store( i , j+SIMDSIZE*3UL, xmm4 );
2546 C.store( i , j+SIMDSIZE*4UL, xmm5 );
2547 C.store( i+1UL, j , xmm6 );
2548 C.store( i+1UL, j+SIMDSIZE , xmm7 );
2549 C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2550 C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
2551 C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
2552 }
2553
// Leftover single row of this column block.
2554 if( i < M )
2555 {
2556 const size_t kbegin( ( IsUpper_v<MT4> )
2557 ?( ( IsLower_v<MT5> )
2558 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2559 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2560 :( IsLower_v<MT5> ? j : 0UL ) );
2561 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
2562
2563 SIMDType xmm1( C.load(i,j ) );
2564 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2565 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2566 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2567 SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
2568
2569 for( size_t k=kbegin; k<kend; ++k ) {
2570 const SIMDType a1( set( A(i,k) ) );
2571 xmm1 += a1 * B.load(k,j );
2572 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2573 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2574 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2575 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2576 }
2577
2578 C.store( i, j , xmm1 );
2579 C.store( i, j+SIMDSIZE , xmm2 );
2580 C.store( i, j+SIMDSIZE*2UL, xmm3 );
2581 C.store( i, j+SIMDSIZE*3UL, xmm4 );
2582 C.store( i, j+SIMDSIZE*4UL, xmm5 );
2583 }
2584 }
2585
// Column blocks of 4 SIMD vectors: two rows at a time, followed by a single leftover row.
// Only used when neither LOW nor UPP is set.
2586 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2587 {
2588 size_t i( 0UL );
2589
2590 for( ; (i+2UL) <= M; i+=2UL )
2591 {
2592 const size_t kbegin( ( IsUpper_v<MT4> )
2593 ?( ( IsLower_v<MT5> )
2594 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2595 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2596 :( IsLower_v<MT5> ? j : 0UL ) );
2597 const size_t kend( ( IsLower_v<MT4> )
2598 ?( ( IsUpper_v<MT5> )
2599 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
2600 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2601 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
2602
2603 SIMDType xmm1( C.load(i ,j ) );
2604 SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
2605 SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
2606 SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
2607 SIMDType xmm5( C.load(i+1UL,j ) );
2608 SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
2609 SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
2610 SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
2611
2612 for( size_t k=kbegin; k<kend; ++k ) {
2613 const SIMDType a1( set( A(i ,k) ) );
2614 const SIMDType a2( set( A(i+1UL,k) ) );
2615 const SIMDType b1( B.load(k,j ) );
2616 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2617 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2618 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2619 xmm1 += a1 * b1;
2620 xmm2 += a1 * b2;
2621 xmm3 += a1 * b3;
2622 xmm4 += a1 * b4;
2623 xmm5 += a2 * b1;
2624 xmm6 += a2 * b2;
2625 xmm7 += a2 * b3;
2626 xmm8 += a2 * b4;
2627 }
2628
2629 C.store( i , j , xmm1 );
2630 C.store( i , j+SIMDSIZE , xmm2 );
2631 C.store( i , j+SIMDSIZE*2UL, xmm3 );
2632 C.store( i , j+SIMDSIZE*3UL, xmm4 );
2633 C.store( i+1UL, j , xmm5 );
2634 C.store( i+1UL, j+SIMDSIZE , xmm6 );
2635 C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2636 C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
2637 }
2638
// Leftover single row of this column block.
2639 if( i < M )
2640 {
2641 const size_t kbegin( ( IsUpper_v<MT4> )
2642 ?( ( IsLower_v<MT5> )
2643 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2644 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2645 :( IsLower_v<MT5> ? j : 0UL ) );
2646 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
2647
2648 SIMDType xmm1( C.load(i,j ) );
2649 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2650 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2651 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2652
2653 for( size_t k=kbegin; k<kend; ++k ) {
2654 const SIMDType a1( set( A(i,k) ) );
2655 xmm1 += a1 * B.load(k,j );
2656 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2657 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2658 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2659 }
2660
2661 C.store( i, j , xmm1 );
2662 C.store( i, j+SIMDSIZE , xmm2 );
2663 C.store( i, j+SIMDSIZE*2UL, xmm3 );
2664 C.store( i, j+SIMDSIZE*3UL, xmm4 );
2665 }
2666 }
2667
// Column blocks of 3 SIMD vectors: two rows at a time, followed by a single leftover row.
// Only used when neither LOW nor UPP is set.
2668 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2669 {
2670 size_t i( 0UL );
2671
2672 for( ; (i+2UL) <= M; i+=2UL )
2673 {
2674 const size_t kbegin( ( IsUpper_v<MT4> )
2675 ?( ( IsLower_v<MT5> )
2676 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2677 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2678 :( IsLower_v<MT5> ? j : 0UL ) );
2679 const size_t kend( ( IsLower_v<MT4> )
2680 ?( ( IsUpper_v<MT5> )
2681 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
2682 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2683 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
2684
2685 SIMDType xmm1( C.load(i ,j ) );
2686 SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
2687 SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
2688 SIMDType xmm4( C.load(i+1UL,j ) );
2689 SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
2690 SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
2691
2692 for( size_t k=kbegin; k<kend; ++k ) {
2693 const SIMDType a1( set( A(i ,k) ) );
2694 const SIMDType a2( set( A(i+1UL,k) ) );
2695 const SIMDType b1( B.load(k,j ) );
2696 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2697 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2698 xmm1 += a1 * b1;
2699 xmm2 += a1 * b2;
2700 xmm3 += a1 * b3;
2701 xmm4 += a2 * b1;
2702 xmm5 += a2 * b2;
2703 xmm6 += a2 * b3;
2704 }
2705
2706 C.store( i , j , xmm1 );
2707 C.store( i , j+SIMDSIZE , xmm2 );
2708 C.store( i , j+SIMDSIZE*2UL, xmm3 );
2709 C.store( i+1UL, j , xmm4 );
2710 C.store( i+1UL, j+SIMDSIZE , xmm5 );
2711 C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
2712 }
2713
// Leftover single row of this column block.
2714 if( i < M )
2715 {
2716 const size_t kbegin( ( IsUpper_v<MT4> )
2717 ?( ( IsLower_v<MT5> )
2718 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2719 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2720 :( IsLower_v<MT5> ? j : 0UL ) );
2721 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
2722
2723 SIMDType xmm1( C.load(i,j ) );
2724 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2725 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2726
2727 for( size_t k=kbegin; k<kend; ++k ) {
2728 const SIMDType a1( set( A(i,k) ) );
2729 xmm1 += a1 * B.load(k,j );
2730 xmm2 += a1 * B.load(k,j+SIMDSIZE );
2731 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2732 }
2733
2734 C.store( i, j , xmm1 );
2735 C.store( i, j+SIMDSIZE , xmm2 );
2736 C.store( i, j+SIMDSIZE*2UL, xmm3 );
2737 }
2738 }
2739
// Column blocks of 2 SIMD vectors: rows in groups of 4, 3, 2 and finally 1. This loop is skipped
// only when both LOW and UPP are set. With LOW the row range starts at i=j; with UPP it ends at
// min(j+SIMDSIZE*2UL,M).
2740 for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2741 {
2742 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
2743 size_t i( LOW ? j : 0UL );
2744
2745 for( ; (i+4UL) <= iend; i+=4UL )
2746 {
2747 const size_t kbegin( ( IsUpper_v<MT4> )
2748 ?( ( IsLower_v<MT5> )
2749 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2750 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2751 :( IsLower_v<MT5> ? j : 0UL ) );
2752 const size_t kend( ( IsLower_v<MT4> )
2753 ?( ( IsUpper_v<MT5> )
2754 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
2755 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
2756 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2757
2758 SIMDType xmm1( C.load(i ,j ) );
2759 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2760 SIMDType xmm3( C.load(i+1UL,j ) );
2761 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2762 SIMDType xmm5( C.load(i+2UL,j ) );
2763 SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
2764 SIMDType xmm7( C.load(i+3UL,j ) );
2765 SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
2766
2767 for( size_t k=kbegin; k<kend; ++k ) {
2768 const SIMDType a1( set( A(i ,k) ) );
2769 const SIMDType a2( set( A(i+1UL,k) ) );
2770 const SIMDType a3( set( A(i+2UL,k) ) );
2771 const SIMDType a4( set( A(i+3UL,k) ) );
2772 const SIMDType b1( B.load(k,j ) );
2773 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2774 xmm1 += a1 * b1;
2775 xmm2 += a1 * b2;
2776 xmm3 += a2 * b1;
2777 xmm4 += a2 * b2;
2778 xmm5 += a3 * b1;
2779 xmm6 += a3 * b2;
2780 xmm7 += a4 * b1;
2781 xmm8 += a4 * b2;
2782 }
2783
2784 C.store( i , j , xmm1 );
2785 C.store( i , j+SIMDSIZE, xmm2 );
2786 C.store( i+1UL, j , xmm3 );
2787 C.store( i+1UL, j+SIMDSIZE, xmm4 );
2788 C.store( i+2UL, j , xmm5 );
2789 C.store( i+2UL, j+SIMDSIZE, xmm6 );
2790 C.store( i+3UL, j , xmm7 );
2791 C.store( i+3UL, j+SIMDSIZE, xmm8 );
2792 }
2793
2794 for( ; (i+3UL) <= iend; i+=3UL )
2795 {
2796 const size_t kbegin( ( IsUpper_v<MT4> )
2797 ?( ( IsLower_v<MT5> )
2798 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2799 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2800 :( IsLower_v<MT5> ? j : 0UL ) );
2801 const size_t kend( ( IsLower_v<MT4> )
2802 ?( ( IsUpper_v<MT5> )
2803 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
2804 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
2805 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2806
2807 SIMDType xmm1( C.load(i ,j ) );
2808 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2809 SIMDType xmm3( C.load(i+1UL,j ) );
2810 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2811 SIMDType xmm5( C.load(i+2UL,j ) );
2812 SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
2813
2814 for( size_t k=kbegin; k<kend; ++k ) {
2815 const SIMDType a1( set( A(i ,k) ) );
2816 const SIMDType a2( set( A(i+1UL,k) ) );
2817 const SIMDType a3( set( A(i+2UL,k) ) );
2818 const SIMDType b1( B.load(k,j ) );
2819 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2820 xmm1 += a1 * b1;
2821 xmm2 += a1 * b2;
2822 xmm3 += a2 * b1;
2823 xmm4 += a2 * b2;
2824 xmm5 += a3 * b1;
2825 xmm6 += a3 * b2;
2826 }
2827
2828 C.store( i , j , xmm1 );
2829 C.store( i , j+SIMDSIZE, xmm2 );
2830 C.store( i+1UL, j , xmm3 );
2831 C.store( i+1UL, j+SIMDSIZE, xmm4 );
2832 C.store( i+2UL, j , xmm5 );
2833 C.store( i+2UL, j+SIMDSIZE, xmm6 );
2834 }
2835
2836 for( ; (i+2UL) <= iend; i+=2UL )
2837 {
2838 const size_t kbegin( ( IsUpper_v<MT4> )
2839 ?( ( IsLower_v<MT5> )
2840 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2841 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2842 :( IsLower_v<MT5> ? j : 0UL ) );
2843 const size_t kend( ( IsLower_v<MT4> )
2844 ?( ( IsUpper_v<MT5> )
2845 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2846 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2847 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2848
2849 SIMDType xmm1( C.load(i ,j ) );
2850 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2851 SIMDType xmm3( C.load(i+1UL,j ) );
2852 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2853
2854 for( size_t k=kbegin; k<kend; ++k ) {
2855 const SIMDType a1( set( A(i ,k) ) );
2856 const SIMDType a2( set( A(i+1UL,k) ) );
2857 const SIMDType b1( B.load(k,j ) );
2858 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2859 xmm1 += a1 * b1;
2860 xmm2 += a1 * b2;
2861 xmm3 += a2 * b1;
2862 xmm4 += a2 * b2;
2863 }
2864
2865 C.store( i , j , xmm1 );
2866 C.store( i , j+SIMDSIZE, xmm2 );
2867 C.store( i+1UL, j , xmm3 );
2868 C.store( i+1UL, j+SIMDSIZE, xmm4 );
2869 }
2870
// Leftover single row of this column block.
2871 if( i < iend )
2872 {
2873 const size_t kbegin( ( IsUpper_v<MT4> )
2874 ?( ( IsLower_v<MT5> )
2875 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2876 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2877 :( IsLower_v<MT5> ? j : 0UL ) );
2878 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
2879
2880 SIMDType xmm1( C.load(i,j ) );
2881 SIMDType xmm2( C.load(i,j+SIMDSIZE) );
2882
2883 for( size_t k=kbegin; k<kend; ++k ) {
2884 const SIMDType a1( set( A(i,k) ) );
2885 xmm1 += a1 * B.load(k,j );
2886 xmm2 += a1 * B.load(k,j+SIMDSIZE);
2887 }
2888
2889 C.store( i, j , xmm1 );
2890 C.store( i, j+SIMDSIZE, xmm2 );
2891 }
2892 }
2893
// Single SIMD vector column blocks up to jpos: rows in groups of 4, 3, 2 and finally 1. With LOW
// the row range starts at i=j; only when both LOW and UPP are set it ends at min(j+SIMDSIZE,M).
2894 for( ; j<jpos; j+=SIMDSIZE )
2895 {
2896 const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
2897 size_t i( LOW ? j : 0UL );
2898
2899 for( ; (i+4UL) <= iend; i+=4UL )
2900 {
2901 const size_t kbegin( ( IsUpper_v<MT4> )
2902 ?( ( IsLower_v<MT5> )
2903 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2904 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2905 :( IsLower_v<MT5> ? j : 0UL ) );
2906 const size_t kend( ( IsLower_v<MT4> )
2907 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
2908 :( K ) );
2909
2910 SIMDType xmm1( C.load(i ,j) );
2911 SIMDType xmm2( C.load(i+1UL,j) );
2912 SIMDType xmm3( C.load(i+2UL,j) );
2913 SIMDType xmm4( C.load(i+3UL,j) );
2914
2915 for( size_t k=kbegin; k<kend; ++k ) {
2916 const SIMDType b1( B.load(k,j) );
2917 xmm1 += set( A(i ,k) ) * b1;
2918 xmm2 += set( A(i+1UL,k) ) * b1;
2919 xmm3 += set( A(i+2UL,k) ) * b1;
2920 xmm4 += set( A(i+3UL,k) ) * b1;
2921 }
2922
2923 C.store( i , j, xmm1 );
2924 C.store( i+1UL, j, xmm2 );
2925 C.store( i+2UL, j, xmm3 );
2926 C.store( i+3UL, j, xmm4 );
2927 }
2928
2929 for( ; (i+3UL) <= iend; i+=3UL )
2930 {
2931 const size_t kbegin( ( IsUpper_v<MT4> )
2932 ?( ( IsLower_v<MT5> )
2933 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2934 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2935 :( IsLower_v<MT5> ? j : 0UL ) );
2936 const size_t kend( ( IsLower_v<MT4> )
2937 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
2938 :( K ) );
2939
2940 SIMDType xmm1( C.load(i ,j) );
2941 SIMDType xmm2( C.load(i+1UL,j) );
2942 SIMDType xmm3( C.load(i+2UL,j) );
2943
2944 for( size_t k=kbegin; k<kend; ++k ) {
2945 const SIMDType b1( B.load(k,j) );
2946 xmm1 += set( A(i ,k) ) * b1;
2947 xmm2 += set( A(i+1UL,k) ) * b1;
2948 xmm3 += set( A(i+2UL,k) ) * b1;
2949 }
2950
2951 C.store( i , j, xmm1 );
2952 C.store( i+1UL, j, xmm2 );
2953 C.store( i+2UL, j, xmm3 );
2954 }
2955
2956 for( ; (i+2UL) <= iend; i+=2UL )
2957 {
2958 const size_t kbegin( ( IsUpper_v<MT4> )
2959 ?( ( IsLower_v<MT5> )
2960 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2961 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2962 :( IsLower_v<MT5> ? j : 0UL ) );
2963 const size_t kend( ( IsLower_v<MT4> )
2964 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2965 :( K ) );
2966
2967 SIMDType xmm1( C.load(i ,j) );
2968 SIMDType xmm2( C.load(i+1UL,j) );
2969
2970 for( size_t k=kbegin; k<kend; ++k ) {
2971 const SIMDType b1( B.load(k,j) );
2972 xmm1 += set( A(i ,k) ) * b1;
2973 xmm2 += set( A(i+1UL,k) ) * b1;
2974 }
2975
2976 C.store( i , j, xmm1 );
2977 C.store( i+1UL, j, xmm2 );
2978 }
2979
// Leftover single row: k runs from kbegin up to K (no upper restriction is applied here).
2980 if( i < iend )
2981 {
2982 const size_t kbegin( ( IsUpper_v<MT4> )
2983 ?( ( IsLower_v<MT5> )
2984 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2985 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2986 :( IsLower_v<MT5> ? j : 0UL ) );
2987
2988 SIMDType xmm1( C.load(i,j) );
2989
2990 for( size_t k=kbegin; k<K; ++k ) {
2991 xmm1 += set( A(i,k) ) * B.load(k,j);
2992 }
2993
2994 C.store( i, j, xmm1 );
2995 }
2996 }
2997
// Scalar handling of the remaining columns [jpos,N), one column at a time with rows in pairs plus
// a single leftover row. Only executed if 'remainder' is true. With UPP the row range ends at
// j+1UL; with LOW it starts at i=j.
2998 for( ; remainder && j<N; ++j )
2999 {
3000 const size_t iend( UPP ? j+1UL : M );
3001 size_t i( LOW ? j : 0UL );
3002
3003 for( ; (i+2UL) <= iend; i+=2UL )
3004 {
3005 const size_t kbegin( ( IsUpper_v<MT4> )
3006 ?( ( IsLower_v<MT5> )
3007 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3008 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3009 :( IsLower_v<MT5> ? j : 0UL ) );
3010 const size_t kend( ( IsLower_v<MT4> )
3011 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3012 :( K ) );
3013
3014 ElementType value1( C(i ,j) );
3015 ElementType value2( C(i+1UL,j) );;
3016
3017 for( size_t k=kbegin; k<kend; ++k ) {
3018 value1 += A(i ,k) * B(k,j);
3019 value2 += A(i+1UL,k) * B(k,j);
3020 }
3021
3022 C(i ,j) = value1;
3023 C(i+1UL,j) = value2;
3024 }
3025
3026 if( i < iend )
3027 {
3028 const size_t kbegin( ( IsUpper_v<MT4> )
3029 ?( ( IsLower_v<MT5> )
3030 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3031 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3032 :( IsLower_v<MT5> ? j : 0UL ) );
3033
3034 ElementType value( C(i,j) );
3035
3036 for( size_t k=kbegin; k<K; ++k ) {
3037 value += A(i,k) * B(k,j);
3038 }
3039
3040 C(i,j) = value;
3041 }
3042 }
3043 }
3045 //**********************************************************************************************
3046
3047 //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
3062 template< typename MT3 // Type of the left-hand side target matrix
3063 , typename MT4 // Type of the left-hand side matrix operand
3064 , typename MT5 > // Type of the right-hand side matrix operand
3065 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3066 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3067 {
3072
3073 const ForwardFunctor fwd;
3074
3075 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
3076 const OppositeType_t<MT4> tmp( serial( A ) );
3077 addAssign( C, fwd( tmp * B ) );
3078 }
3079 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
3080 const OppositeType_t<MT5> tmp( serial( B ) );
3081 addAssign( C, fwd( A * tmp ) );
3082 }
3083 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3084 const OppositeType_t<MT4> tmp( serial( A ) );
3085 addAssign( C, fwd( tmp * B ) );
3086 }
3087 else {
3088 const OppositeType_t<MT5> tmp( serial( B ) );
3089 addAssign( C, fwd( A * tmp ) );
3090 }
3091 }
3093 //**********************************************************************************************
3094
3095 //**Default addition assignment to dense matrices (large matrices)******************************
3109 template< typename MT3 // Type of the left-hand side target matrix
3110 , typename MT4 // Type of the left-hand side matrix operand
3111 , typename MT5 > // Type of the right-hand side matrix operand
3112 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3113 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3114 {
3115 selectDefaultAddAssignKernel( C, A, B );
3116 }
3118 //**********************************************************************************************
3119
3120 //**Vectorized default addition assignment to dense matrices (large matrices)*******************
3135 template< typename MT3 // Type of the left-hand side target matrix
3136 , typename MT4 // Type of the left-hand side matrix operand
3137 , typename MT5 > // Type of the right-hand side matrix operand
3138 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3139 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3140 {
3141 if( LOW )
3142 lmmm( C, A, B, ElementType(1), ElementType(1) );
3143 else if( UPP )
3144 ummm( C, A, B, ElementType(1), ElementType(1) );
3145 else
3146 mmm( C, A, B, ElementType(1), ElementType(1) );
3147 }
3149 //**********************************************************************************************
3150
3151 //**BLAS-based addition assignment to dense matrices (default)**********************************
3165 template< typename MT3 // Type of the left-hand side target matrix
3166 , typename MT4 // Type of the left-hand side matrix operand
3167 , typename MT5 > // Type of the right-hand side matrix operand
3168 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3169 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3170 {
3171 selectLargeAddAssignKernel( C, A, B );
3172 }
3174 //**********************************************************************************************
3175
3176 //**BLAS-based addition assignment to dense matrices********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
/*!\brief BLAS-based addition assignment of a dense matrix-dense matrix multiplication
//        (\f$ C+=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// If the type of \a A is triangular, \a B is copied into a temporary, the temporary is
// multiplied in place from the left via \c trmm(), and the result is added to \a C. If
// instead the type of \a B is triangular, the same is done with a copy of \a A multiplied
// from the right. In all other cases \c gemm() is called directly on \a C with both scalar
// arguments set to one.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
{
   using Scalar = ElementType_t<MT3>;

   // Triangular left-hand side operand: product = A * B computed in place in a copy of B
   if( IsTriangular_v<MT4> ) {
      const auto uplo( ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ) );
      ResultType_t<MT3> product( serial( B ) );
      trmm( product, A, CblasLeft, uplo, Scalar(1) );
      addAssign( C, product );
      return;
   }

   // Triangular right-hand side operand: product = A * B computed in place in a copy of A
   if( IsTriangular_v<MT5> ) {
      const auto uplo( ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ) );
      ResultType_t<MT3> product( serial( A ) );
      trmm( product, B, CblasRight, uplo, Scalar(1) );
      addAssign( C, product );
      return;
   }

   // General case
   gemm( C, A, B, Scalar(1), Scalar(1) );
}
#endif
3215 //**********************************************************************************************
3216
3217 //**Restructuring addition assignment to column-major matrices**********************************
3232 template< typename MT > // Type of the target matrix
3233 friend inline auto addAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3234 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3235 {
3237
3239
3240 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
3241 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
3242
3243 const ForwardFunctor fwd;
3244
3245 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
3246 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
3247
3248 addAssign( *lhs, fwd( A * B ) );
3249 }
3251 //**********************************************************************************************
3252
3253 //**Addition assignment to sparse matrices******************************************************
3254 // No special implementation for the addition assignment to sparse matrices.
3255 //**********************************************************************************************
3256
3257 //**Subtraction assignment to dense matrices****************************************************
3270 template< typename MT // Type of the target dense matrix
3271 , bool SO > // Storage order of the target dense matrix
3272 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3273 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3274 {
3276
3277 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
3278 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
3279
3280 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3281 return;
3282 }
3283
3284 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3285 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3286
3287 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3288 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3289 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3290 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3291 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
3292 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
3293
3294 DMatDMatMultExpr::selectSubAssignKernel( *lhs, A, B );
3295 }
3297 //**********************************************************************************************
3298
3299 //**Subtraction assignment to dense matrices (kernel selection)*********************************
3310 template< typename MT3 // Type of the left-hand side target matrix
3311 , typename MT4 // Type of the left-hand side matrix operand
3312 , typename MT5 > // Type of the right-hand side matrix operand
3313 static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3314 {
3315 if( ( IsDiagonal_v<MT5> ) ||
3316 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
3317 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
3318 selectSmallSubAssignKernel( C, A, B );
3319 else
3320 selectBlasSubAssignKernel( C, A, B );
3321 }
3323 //**********************************************************************************************
3324
3325 //**Default subtraction assignment to dense matrices (general/general)**************************
3339 template< typename MT3 // Type of the left-hand side target matrix
3340 , typename MT4 // Type of the left-hand side matrix operand
3341 , typename MT5 > // Type of the right-hand side matrix operand
3342 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3343 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3344 {
3345 const size_t M( A.rows() );
3346 const size_t N( B.columns() );
3347 const size_t K( A.columns() );
3348
3349 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3350
3351 for( size_t i=0UL; i<M; ++i )
3352 {
3353 const size_t kbegin( ( IsUpper_v<MT4> )
3354 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3355 :( 0UL ) );
3356 const size_t kend( ( IsLower_v<MT4> )
3357 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3358 :( K ) );
3359 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3360
3361 for( size_t k=kbegin; k<kend; ++k )
3362 {
3363 const size_t jbegin( ( IsUpper_v<MT5> )
3364 ?( ( IsStrictlyUpper_v<MT5> )
3365 ?( UPP ? max(i,k+1UL) : k+1UL )
3366 :( UPP ? max(i,k) : k ) )
3367 :( UPP ? i : 0UL ) );
3368 const size_t jend( ( IsLower_v<MT5> )
3369 ?( ( IsStrictlyLower_v<MT5> )
3370 ?( LOW ? min(i+1UL,k) : k )
3371 :( LOW ? min(i,k)+1UL : k+1UL ) )
3372 :( LOW ? i+1UL : N ) );
3373
3374 if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
3375 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3376
3377 const size_t jnum( jend - jbegin );
3378 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
3379 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
3380
3381 for( size_t j=jbegin; j<jpos; j+=2UL ) {
3382 C(i,j ) -= A(i,k) * B(k,j );
3383 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3384 }
3385 if( jpos < jend ) {
3386 C(i,jpos) -= A(i,k) * B(k,jpos);
3387 }
3388 }
3389 }
3390 }
3392 //**********************************************************************************************
3393
3394 //**Default subtraction assignment to dense matrices (general/diagonal)*************************
3408 template< typename MT3 // Type of the left-hand side target matrix
3409 , typename MT4 // Type of the left-hand side matrix operand
3410 , typename MT5 > // Type of the right-hand side matrix operand
3411 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3412 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3413 {
3415
3416 const size_t M( A.rows() );
3417 const size_t N( B.columns() );
3418
3419 for( size_t i=0UL; i<M; ++i )
3420 {
3421 const size_t jbegin( ( IsUpper_v<MT4> )
3422 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3423 :( 0UL ) );
3424 const size_t jend( ( IsLower_v<MT4> )
3425 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3426 :( N ) );
3427 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3428
3429 const size_t jnum( jend - jbegin );
3430 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
3431 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
3432
3433 for( size_t j=jbegin; j<jpos; j+=2UL ) {
3434 C(i,j ) -= A(i,j ) * B(j ,j );
3435 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
3436 }
3437 if( jpos < jend ) {
3438 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
3439 }
3440 }
3441 }
3443 //**********************************************************************************************
3444
3445 //**Default subtraction assignment to dense matrices (diagonal/general)*************************
3459 template< typename MT3 // Type of the left-hand side target matrix
3460 , typename MT4 // Type of the left-hand side matrix operand
3461 , typename MT5 > // Type of the right-hand side matrix operand
3462 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3463 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3464 {
3466
3467 const size_t M( A.rows() );
3468 const size_t N( B.columns() );
3469
3470 for( size_t i=0UL; i<M; ++i )
3471 {
3472 const size_t jbegin( ( IsUpper_v<MT5> )
3473 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3474 :( 0UL ) );
3475 const size_t jend( ( IsLower_v<MT5> )
3476 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
3477 :( N ) );
3478 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3479
3480 const size_t jnum( jend - jbegin );
3481 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
3482 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
3483
3484 for( size_t j=jbegin; j<jpos; j+=2UL ) {
3485 C(i,j ) -= A(i,i) * B(i,j );
3486 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
3487 }
3488 if( jpos < jend ) {
3489 C(i,jpos) -= A(i,i) * B(i,jpos);
3490 }
3491 }
3492 }
3494 //**********************************************************************************************
3495
3496 //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
3510 template< typename MT3 // Type of the left-hand side target matrix
3511 , typename MT4 // Type of the left-hand side matrix operand
3512 , typename MT5 > // Type of the right-hand side matrix operand
3513 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3514 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3515 {
3517
3518 for( size_t i=0UL; i<A.rows(); ++i ) {
3519 C(i,i) -= A(i,i) * B(i,i);
3520 }
3521 }
3523 //**********************************************************************************************
3524
3525 //**Default subtraction assignment to dense matrices (small matrices)***************************
3539 template< typename MT3 // Type of the left-hand side target matrix
3540 , typename MT4 // Type of the left-hand side matrix operand
3541 , typename MT5 > // Type of the right-hand side matrix operand
3542 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3543 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3544 {
3545 selectDefaultSubAssignKernel( C, A, B );
3546 }
3548 //**********************************************************************************************
3549
3550 //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
/*!\brief Vectorized default subtraction assignment of a small dense matrix-dense matrix
//        multiplication to a row-major target (\f$ C-=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The target is updated in column panels. The panel width shrinks from eight (integral element
// types only) over five, four, three and two SIMD vectors down to a single SIMD vector; any
// columns beyond the last full SIMD vector are processed element by element. Within a panel
// the current values of C are loaded into accumulators, products of A(i,k) with SIMD vectors
// loaded from row k of B are subtracted, and the accumulators are stored back to C.
//
// NOTE(review): LOW, UPP, SIMDSIZE, SIMDType and ElementType are not defined in this block;
// presumably LOW/UPP are the lower/upper flags of the enclosing expression class - confirm
// against the class definition.
*/
3565 template< typename MT3 // Type of the left-hand side target matrix
3566 , typename MT4 // Type of the left-hand side matrix operand
3567 , typename MT5 > // Type of the right-hand side matrix operand
3568 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3569 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3570 {
     // A scalar remainder loop is needed unless both C and B are padded
3571 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3572
3573 const size_t M( A.rows() );
3574 const size_t N( B.columns() );
3575 const size_t K( A.columns() );
3576
     // If either LOW or UPP is set, the target is required to be square
3577 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3578
     // jpos is the end of the column range that is processed with SIMD operations
3579 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
3580 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
3581
3582 size_t j( 0UL );
3583
     // Panels of eight SIMD vectors, one row at a time (integral element types only;
     // skipped if LOW or UPP is set)
3584 if( IsIntegral_v<ElementType> )
3585 {
3586 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3587 for( size_t i=0UL; i<M; ++i )
3588 {
     // [kbegin,kend) is the range of the inner dimension that is traversed; it is
     // narrowed depending on whether A and/or B are lower/upper matrices
3589 const size_t kbegin( ( IsUpper_v<MT4> )
3590 ?( ( IsLower_v<MT5> )
3591 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3592 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3593 :( IsLower_v<MT5> ? j : 0UL ) );
3594 const size_t kend( ( IsLower_v<MT4> )
3595 ?( ( IsUpper_v<MT5> )
3596 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3597 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3598 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
3599
     // Accumulators start from the current values of C
3600 SIMDType xmm1( C.load(i,j ) );
3601 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3602 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3603 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3604 SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3605 SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
3606 SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
3607 SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
3608
3609 for( size_t k=kbegin; k<kend; ++k ) {
     // set() presumably broadcasts the scalar A(i,k) to all SIMD lanes - TODO confirm
3610 const SIMDType a1( set( A(i,k) ) );
3611 xmm1 -= a1 * B.load(k,j );
3612 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3613 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3614 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3615 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3616 xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
3617 xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
3618 xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
3619 }
3620
3621 C.store( i, j , xmm1 );
3622 C.store( i, j+SIMDSIZE , xmm2 );
3623 C.store( i, j+SIMDSIZE*2UL, xmm3 );
3624 C.store( i, j+SIMDSIZE*3UL, xmm4 );
3625 C.store( i, j+SIMDSIZE*4UL, xmm5 );
3626 C.store( i, j+SIMDSIZE*5UL, xmm6 );
3627 C.store( i, j+SIMDSIZE*6UL, xmm7 );
3628 C.store( i, j+SIMDSIZE*7UL, xmm8 );
3629 }
3630 }
3631 }
3632
     // Panels of five SIMD vectors, two rows at a time (skipped if LOW or UPP is set)
3633 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3634 {
3635 size_t i( 0UL );
3636
3637 for( ; (i+2UL) <= M; i+=2UL )
3638 {
3639 const size_t kbegin( ( IsUpper_v<MT4> )
3640 ?( ( IsLower_v<MT5> )
3641 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3642 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3643 :( IsLower_v<MT5> ? j : 0UL ) );
3644 const size_t kend( ( IsLower_v<MT4> )
3645 ?( ( IsUpper_v<MT5> )
3646 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3647 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3648 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
3649
3650 SIMDType xmm1 ( C.load(i ,j ) );
3651 SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
3652 SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
3653 SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
3654 SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
3655 SIMDType xmm6 ( C.load(i+1UL,j ) );
3656 SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
3657 SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
3658 SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
3659 SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
3660
3661 for( size_t k=kbegin; k<kend; ++k ) {
3662 const SIMDType a1( set( A(i ,k) ) );
3663 const SIMDType a2( set( A(i+1UL,k) ) );
3664 const SIMDType b1( B.load(k,j ) );
3665 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3666 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3667 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3668 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3669 xmm1 -= a1 * b1;
3670 xmm2 -= a1 * b2;
3671 xmm3 -= a1 * b3;
3672 xmm4 -= a1 * b4;
3673 xmm5 -= a1 * b5;
3674 xmm6 -= a2 * b1;
3675 xmm7 -= a2 * b2;
3676 xmm8 -= a2 * b3;
3677 xmm9 -= a2 * b4;
3678 xmm10 -= a2 * b5;
3679 }
3680
3681 C.store( i , j , xmm1 );
3682 C.store( i , j+SIMDSIZE , xmm2 );
3683 C.store( i , j+SIMDSIZE*2UL, xmm3 );
3684 C.store( i , j+SIMDSIZE*3UL, xmm4 );
3685 C.store( i , j+SIMDSIZE*4UL, xmm5 );
3686 C.store( i+1UL, j , xmm6 );
3687 C.store( i+1UL, j+SIMDSIZE , xmm7 );
3688 C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3689 C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3690 C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3691 }
3692
     // Single leftover row; here the upper bound of k depends on B only
3693 if( i < M )
3694 {
3695 const size_t kbegin( ( IsUpper_v<MT4> )
3696 ?( ( IsLower_v<MT5> )
3697 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3698 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3699 :( IsLower_v<MT5> ? j : 0UL ) );
3700 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3701
3702 SIMDType xmm1( C.load(i,j ) );
3703 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3704 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3705 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3706 SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3707
3708 for( size_t k=kbegin; k<kend; ++k ) {
3709 const SIMDType a1( set( A(i,k) ) );
3710 xmm1 -= a1 * B.load(k,j );
3711 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3712 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3713 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3714 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3715 }
3716
3717 C.store( i, j , xmm1 );
3718 C.store( i, j+SIMDSIZE , xmm2 );
3719 C.store( i, j+SIMDSIZE*2UL, xmm3 );
3720 C.store( i, j+SIMDSIZE*3UL, xmm4 );
3721 C.store( i, j+SIMDSIZE*4UL, xmm5 );
3722 }
3723 }
3724
     // Panels of four SIMD vectors, two rows at a time (skipped if LOW or UPP is set)
3725 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3726 {
3727 size_t i( 0UL );
3728
3729 for( ; (i+2UL) <= M; i+=2UL )
3730 {
3731 const size_t kbegin( ( IsUpper_v<MT4> )
3732 ?( ( IsLower_v<MT5> )
3733 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3734 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3735 :( IsLower_v<MT5> ? j : 0UL ) );
3736 const size_t kend( ( IsLower_v<MT4> )
3737 ?( ( IsUpper_v<MT5> )
3738 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3739 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3740 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
3741
3742 SIMDType xmm1( C.load(i ,j ) );
3743 SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3744 SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3745 SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
3746 SIMDType xmm5( C.load(i+1UL,j ) );
3747 SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
3748 SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
3749 SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
3750
3751 for( size_t k=kbegin; k<kend; ++k ) {
3752 const SIMDType a1( set( A(i ,k) ) );
3753 const SIMDType a2( set( A(i+1UL,k) ) );
3754 const SIMDType b1( B.load(k,j ) );
3755 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3756 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3757 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3758 xmm1 -= a1 * b1;
3759 xmm2 -= a1 * b2;
3760 xmm3 -= a1 * b3;
3761 xmm4 -= a1 * b4;
3762 xmm5 -= a2 * b1;
3763 xmm6 -= a2 * b2;
3764 xmm7 -= a2 * b3;
3765 xmm8 -= a2 * b4;
3766 }
3767
3768 C.store( i , j , xmm1 );
3769 C.store( i , j+SIMDSIZE , xmm2 );
3770 C.store( i , j+SIMDSIZE*2UL, xmm3 );
3771 C.store( i , j+SIMDSIZE*3UL, xmm4 );
3772 C.store( i+1UL, j , xmm5 );
3773 C.store( i+1UL, j+SIMDSIZE , xmm6 );
3774 C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3775 C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3776 }
3777
     // Single leftover row; here the upper bound of k depends on B only
3778 if( i < M )
3779 {
3780 const size_t kbegin( ( IsUpper_v<MT4> )
3781 ?( ( IsLower_v<MT5> )
3782 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3783 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3784 :( IsLower_v<MT5> ? j : 0UL ) );
3785 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3786
3787 SIMDType xmm1( C.load(i,j ) );
3788 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3789 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3790 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3791
3792 for( size_t k=kbegin; k<kend; ++k ) {
3793 const SIMDType a1( set( A(i,k) ) );
3794 xmm1 -= a1 * B.load(k,j );
3795 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3796 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3797 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3798 }
3799
3800 C.store( i, j , xmm1 );
3801 C.store( i, j+SIMDSIZE , xmm2 );
3802 C.store( i, j+SIMDSIZE*2UL, xmm3 );
3803 C.store( i, j+SIMDSIZE*3UL, xmm4 );
3804 }
3805 }
3806
     // Panels of three SIMD vectors, two rows at a time (skipped if LOW or UPP is set)
3807 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3808 {
3809 size_t i( 0UL );
3810
3811 for( ; (i+2UL) <= M; i+=2UL )
3812 {
3813 const size_t kbegin( ( IsUpper_v<MT4> )
3814 ?( ( IsLower_v<MT5> )
3815 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3816 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3817 :( IsLower_v<MT5> ? j : 0UL ) );
3818 const size_t kend( ( IsLower_v<MT4> )
3819 ?( ( IsUpper_v<MT5> )
3820 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3821 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3822 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
3823
3824 SIMDType xmm1( C.load(i ,j ) );
3825 SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3826 SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3827 SIMDType xmm4( C.load(i+1UL,j ) );
3828 SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
3829 SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
3830
3831 for( size_t k=kbegin; k<kend; ++k ) {
3832 const SIMDType a1( set( A(i ,k) ) );
3833 const SIMDType a2( set( A(i+1UL,k) ) );
3834 const SIMDType b1( B.load(k,j ) );
3835 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3836 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3837 xmm1 -= a1 * b1;
3838 xmm2 -= a1 * b2;
3839 xmm3 -= a1 * b3;
3840 xmm4 -= a2 * b1;
3841 xmm5 -= a2 * b2;
3842 xmm6 -= a2 * b3;
3843 }
3844
3845 C.store( i , j , xmm1 );
3846 C.store( i , j+SIMDSIZE , xmm2 );
3847 C.store( i , j+SIMDSIZE*2UL, xmm3 );
3848 C.store( i+1UL, j , xmm4 );
3849 C.store( i+1UL, j+SIMDSIZE , xmm5 );
3850 C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3851 }
3852
     // Single leftover row; here the upper bound of k depends on B only
3853 if( i < M )
3854 {
3855 const size_t kbegin( ( IsUpper_v<MT4> )
3856 ?( ( IsLower_v<MT5> )
3857 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3858 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3859 :( IsLower_v<MT5> ? j : 0UL ) );
3860 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
3861
3862 SIMDType xmm1( C.load(i,j ) );
3863 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3864 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3865
3866 for( size_t k=kbegin; k<kend; ++k ) {
3867 const SIMDType a1( set( A(i,k) ) );
3868 xmm1 -= a1 * B.load(k,j );
3869 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3870 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3871 }
3872
3873 C.store( i, j , xmm1 );
3874 C.store( i, j+SIMDSIZE , xmm2 );
3875 C.store( i, j+SIMDSIZE*2UL, xmm3 );
3876 }
3877 }
3878
     // Panels of two SIMD vectors (skipped only if both LOW and UPP are set). The row range
     // starts at j if LOW is set and ends at min(j+SIMDSIZE*2,M) if UPP is set. Rows are
     // processed in groups of four, then three, then two, then a single leftover row.
3879 for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3880 {
3881 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
3882 size_t i( LOW ? j : 0UL );
3883
3884 for( ; (i+4UL) <= iend; i+=4UL )
3885 {
3886 const size_t kbegin( ( IsUpper_v<MT4> )
3887 ?( ( IsLower_v<MT5> )
3888 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3889 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3890 :( IsLower_v<MT5> ? j : 0UL ) );
3891 const size_t kend( ( IsLower_v<MT4> )
3892 ?( ( IsUpper_v<MT5> )
3893 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
3894 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3895 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3896
3897 SIMDType xmm1( C.load(i ,j ) );
3898 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3899 SIMDType xmm3( C.load(i+1UL,j ) );
3900 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3901 SIMDType xmm5( C.load(i+2UL,j ) );
3902 SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3903 SIMDType xmm7( C.load(i+3UL,j ) );
3904 SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
3905
3906 for( size_t k=kbegin; k<kend; ++k ) {
3907 const SIMDType a1( set( A(i ,k) ) );
3908 const SIMDType a2( set( A(i+1UL,k) ) );
3909 const SIMDType a3( set( A(i+2UL,k) ) );
3910 const SIMDType a4( set( A(i+3UL,k) ) );
3911 const SIMDType b1( B.load(k,j ) );
3912 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3913 xmm1 -= a1 * b1;
3914 xmm2 -= a1 * b2;
3915 xmm3 -= a2 * b1;
3916 xmm4 -= a2 * b2;
3917 xmm5 -= a3 * b1;
3918 xmm6 -= a3 * b2;
3919 xmm7 -= a4 * b1;
3920 xmm8 -= a4 * b2;
3921 }
3922
3923 C.store( i , j , xmm1 );
3924 C.store( i , j+SIMDSIZE, xmm2 );
3925 C.store( i+1UL, j , xmm3 );
3926 C.store( i+1UL, j+SIMDSIZE, xmm4 );
3927 C.store( i+2UL, j , xmm5 );
3928 C.store( i+2UL, j+SIMDSIZE, xmm6 );
3929 C.store( i+3UL, j , xmm7 );
3930 C.store( i+3UL, j+SIMDSIZE, xmm8 );
3931 }
3932
3933 for( ; (i+3UL) <= iend; i+=3UL )
3934 {
3935 const size_t kbegin( ( IsUpper_v<MT4> )
3936 ?( ( IsLower_v<MT5> )
3937 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3938 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3939 :( IsLower_v<MT5> ? j : 0UL ) );
3940 const size_t kend( ( IsLower_v<MT4> )
3941 ?( ( IsUpper_v<MT5> )
3942 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
3943 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3944 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3945
3946 SIMDType xmm1( C.load(i ,j ) );
3947 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3948 SIMDType xmm3( C.load(i+1UL,j ) );
3949 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3950 SIMDType xmm5( C.load(i+2UL,j ) );
3951 SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3952
3953 for( size_t k=kbegin; k<kend; ++k ) {
3954 const SIMDType a1( set( A(i ,k) ) );
3955 const SIMDType a2( set( A(i+1UL,k) ) );
3956 const SIMDType a3( set( A(i+2UL,k) ) );
3957 const SIMDType b1( B.load(k,j ) );
3958 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3959 xmm1 -= a1 * b1;
3960 xmm2 -= a1 * b2;
3961 xmm3 -= a2 * b1;
3962 xmm4 -= a2 * b2;
3963 xmm5 -= a3 * b1;
3964 xmm6 -= a3 * b2;
3965 }
3966
3967 C.store( i , j , xmm1 );
3968 C.store( i , j+SIMDSIZE, xmm2 );
3969 C.store( i+1UL, j , xmm3 );
3970 C.store( i+1UL, j+SIMDSIZE, xmm4 );
3971 C.store( i+2UL, j , xmm5 );
3972 C.store( i+2UL, j+SIMDSIZE, xmm6 );
3973 }
3974
3975 for( ; (i+2UL) <= iend; i+=2UL )
3976 {
3977 const size_t kbegin( ( IsUpper_v<MT4> )
3978 ?( ( IsLower_v<MT5> )
3979 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3980 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3981 :( IsLower_v<MT5> ? j : 0UL ) );
3982 const size_t kend( ( IsLower_v<MT4> )
3983 ?( ( IsUpper_v<MT5> )
3984 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3985 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3986 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3987
3988 SIMDType xmm1( C.load(i ,j ) );
3989 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3990 SIMDType xmm3( C.load(i+1UL,j ) );
3991 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3992
3993 for( size_t k=kbegin; k<kend; ++k ) {
3994 const SIMDType a1( set( A(i ,k) ) );
3995 const SIMDType a2( set( A(i+1UL,k) ) );
3996 const SIMDType b1( B.load(k,j ) );
3997 const SIMDType b2( B.load(k,j+SIMDSIZE) );
3998 xmm1 -= a1 * b1;
3999 xmm2 -= a1 * b2;
4000 xmm3 -= a2 * b1;
4001 xmm4 -= a2 * b2;
4002 }
4003
4004 C.store( i , j , xmm1 );
4005 C.store( i , j+SIMDSIZE, xmm2 );
4006 C.store( i+1UL, j , xmm3 );
4007 C.store( i+1UL, j+SIMDSIZE, xmm4 );
4008 }
4009
     // Single leftover row; here the upper bound of k depends on B only
4010 if( i < iend )
4011 {
4012 const size_t kbegin( ( IsUpper_v<MT4> )
4013 ?( ( IsLower_v<MT5> )
4014 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4015 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4016 :( IsLower_v<MT5> ? j : 0UL ) );
4017 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
4018
4019 SIMDType xmm1( C.load(i,j ) );
4020 SIMDType xmm2( C.load(i,j+SIMDSIZE) );
4021
4022 for( size_t k=kbegin; k<kend; ++k ) {
4023 const SIMDType a1( set( A(i,k) ) );
4024 xmm1 -= a1 * B.load(k,j );
4025 xmm2 -= a1 * B.load(k,j+SIMDSIZE);
4026 }
4027
4028 C.store( i, j , xmm1 );
4029 C.store( i, j+SIMDSIZE, xmm2 );
4030 }
4031 }
4032
     // Panels of a single SIMD vector. The row range starts at j if LOW is set and ends at
     // min(j+SIMDSIZE,M) if both LOW and UPP are set. In this section the upper bound of k
     // depends on A only.
4033 for( ; j<jpos; j+=SIMDSIZE )
4034 {
4035 const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
4036 size_t i( LOW ? j : 0UL );
4037
4038 for( ; (i+4UL) <= iend; i+=4UL )
4039 {
4040 const size_t kbegin( ( IsUpper_v<MT4> )
4041 ?( ( IsLower_v<MT5> )
4042 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4043 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4044 :( IsLower_v<MT5> ? j : 0UL ) );
4045 const size_t kend( ( IsLower_v<MT4> )
4046 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
4047 :( K ) );
4048
4049 SIMDType xmm1( C.load(i ,j) );
4050 SIMDType xmm2( C.load(i+1UL,j) );
4051 SIMDType xmm3( C.load(i+2UL,j) );
4052 SIMDType xmm4( C.load(i+3UL,j) );
4053
4054 for( size_t k=kbegin; k<kend; ++k ) {
4055 const SIMDType b1( B.load(k,j) );
4056 xmm1 -= set( A(i ,k) ) * b1;
4057 xmm2 -= set( A(i+1UL,k) ) * b1;
4058 xmm3 -= set( A(i+2UL,k) ) * b1;
4059 xmm4 -= set( A(i+3UL,k) ) * b1;
4060 }
4061
4062 C.store( i , j, xmm1 );
4063 C.store( i+1UL, j, xmm2 );
4064 C.store( i+2UL, j, xmm3 );
4065 C.store( i+3UL, j, xmm4 );
4066 }
4067
4068 for( ; (i+3UL) <= iend; i+=3UL )
4069 {
4070 const size_t kbegin( ( IsUpper_v<MT4> )
4071 ?( ( IsLower_v<MT5> )
4072 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4073 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4074 :( IsLower_v<MT5> ? j : 0UL ) );
4075 const size_t kend( ( IsLower_v<MT4> )
4076 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
4077 :( K ) );
4078
4079 SIMDType xmm1( C.load(i ,j) );
4080 SIMDType xmm2( C.load(i+1UL,j) );
4081 SIMDType xmm3( C.load(i+2UL,j) );
4082
4083 for( size_t k=kbegin; k<kend; ++k ) {
4084 const SIMDType b1( B.load(k,j) );
4085 xmm1 -= set( A(i ,k) ) * b1;
4086 xmm2 -= set( A(i+1UL,k) ) * b1;
4087 xmm3 -= set( A(i+2UL,k) ) * b1;
4088 }
4089
4090 C.store( i , j, xmm1 );
4091 C.store( i+1UL, j, xmm2 );
4092 C.store( i+2UL, j, xmm3 );
4093 }
4094
4095 for( ; (i+2UL) <= iend; i+=2UL )
4096 {
4097 const size_t kbegin( ( IsUpper_v<MT4> )
4098 ?( ( IsLower_v<MT5> )
4099 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4100 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4101 :( IsLower_v<MT5> ? j : 0UL ) );
4102 const size_t kend( ( IsLower_v<MT4> )
4103 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4104 :( K ) );
4105
4106 SIMDType xmm1( C.load(i ,j) );
4107 SIMDType xmm2( C.load(i+1UL,j) );
4108
4109 for( size_t k=kbegin; k<kend; ++k ) {
4110 const SIMDType b1( B.load(k,j) );
4111 xmm1 -= set( A(i ,k) ) * b1;
4112 xmm2 -= set( A(i+1UL,k) ) * b1;
4113 }
4114
4115 C.store( i , j, xmm1 );
4116 C.store( i+1UL, j, xmm2 );
4117 }
4118
     // Single leftover row; k runs up to K
4119 if( i < iend )
4120 {
4121 const size_t kbegin( ( IsUpper_v<MT4> )
4122 ?( ( IsLower_v<MT5> )
4123 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4124 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4125 :( IsLower_v<MT5> ? j : 0UL ) );
4126
4127 SIMDType xmm1( C.load(i,j) );
4128
4129 for( size_t k=kbegin; k<K; ++k ) {
4130 xmm1 -= set( A(i,k) ) * B.load(k,j);
4131 }
4132
4133 C.store( i, j, xmm1 );
4134 }
4135 }
4136
     // Scalar processing of the remaining columns (only if a remainder is possible).
     // The row range starts at j if LOW is set and ends at j+1 if UPP is set.
4137 for( ; remainder && j<N; ++j )
4138 {
4139 const size_t iend( UPP ? j+1UL : M );
4140 size_t i( LOW ? j : 0UL );
4141
4142 for( ; (i+2UL) <= iend; i+=2UL )
4143 {
4144 const size_t kbegin( ( IsUpper_v<MT4> )
4145 ?( ( IsLower_v<MT5> )
4146 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4147 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4148 :( IsLower_v<MT5> ? j : 0UL ) );
4149 const size_t kend( ( IsLower_v<MT4> )
4150 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4151 :( K ) );
4152
4153 ElementType value1( C(i ,j) );
4154 ElementType value2( C(i+1UL,j) );
4155
4156 for( size_t k=kbegin; k<kend; ++k ) {
4157 value1 -= A(i ,k) * B(k,j);
4158 value2 -= A(i+1UL,k) * B(k,j);
4159 }
4160
4161 C(i ,j) = value1;
4162 C(i+1UL,j) = value2;
4163 }
4164
     // Single leftover row; k runs up to K
4165 if( i < iend )
4166 {
4167 const size_t kbegin( ( IsUpper_v<MT4> )
4168 ?( ( IsLower_v<MT5> )
4169 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4170 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4171 :( IsLower_v<MT5> ? j : 0UL ) );
4172
4173 ElementType value( C(i,j) );
4174
4175 for( size_t k=kbegin; k<K; ++k ) {
4176 value -= A(i,k) * B(k,j);
4177 }
4178
4179 C(i,j) = value;
4180 }
4181 }
4182 }
4184 //**********************************************************************************************
4185
4186 //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
4201 template< typename MT3 // Type of the left-hand side target matrix
4202 , typename MT4 // Type of the left-hand side matrix operand
4203 , typename MT5 > // Type of the right-hand side matrix operand
4204 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4205 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4206 {
4211
4212 const ForwardFunctor fwd;
4213
4214 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
4215 const OppositeType_t<MT4> tmp( serial( A ) );
4216 subAssign( C, fwd( tmp * B ) );
4217 }
4218 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
4219 const OppositeType_t<MT5> tmp( serial( B ) );
4220 subAssign( C, fwd( A * tmp ) );
4221 }
4222 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4223 const OppositeType_t<MT4> tmp( serial( A ) );
4224 subAssign( C, fwd( tmp * B ) );
4225 }
4226 else {
4227 const OppositeType_t<MT5> tmp( serial( B ) );
4228 subAssign( C, fwd( A * tmp ) );
4229 }
4230 }
4232 //**********************************************************************************************
4233
4234 //**Default subtraction assignment to dense matrices (large matrices)***************************
4248 template< typename MT3 // Type of the left-hand side target matrix
4249 , typename MT4 // Type of the left-hand side matrix operand
4250 , typename MT5 > // Type of the right-hand side matrix operand
4251 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4252 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4253 {
4254 selectDefaultSubAssignKernel( C, A, B );
4255 }
4257 //**********************************************************************************************
4258
4259 //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
4274 template< typename MT3 // Type of the left-hand side target matrix
4275 , typename MT4 // Type of the left-hand side matrix operand
4276 , typename MT5 > // Type of the right-hand side matrix operand
4277 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4278 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4279 {
4280 if( LOW )
4281 lmmm( C, A, B, ElementType(-1), ElementType(1) );
4282 else if( UPP )
4283 ummm( C, A, B, ElementType(-1), ElementType(1) );
4284 else
4285 mmm( C, A, B, ElementType(-1), ElementType(1) );
4286 }
4288 //**********************************************************************************************
4289
4290 //**BLAS-based subtraction assignment to dense matrices (default)*******************************
4304 template< typename MT3 // Type of the left-hand side target matrix
4305 , typename MT4 // Type of the left-hand side matrix operand
4306 , typename MT5 > // Type of the right-hand side matrix operand
4307 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4308 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4309 {
4310 selectLargeSubAssignKernel( C, A, B );
4311 }
4313 //**********************************************************************************************
4314
4315 //**BLAS-based subtraction assignment to dense matrices*****************************************
4316#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4330 template< typename MT3 // Type of the left-hand side target matrix
4331 , typename MT4 // Type of the left-hand side matrix operand
4332 , typename MT5 > // Type of the right-hand side matrix operand
4333 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4334 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4335 {
4336 using ET = ElementType_t<MT3>;
4337
4338 if( IsTriangular_v<MT4> ) {
4339 ResultType_t<MT3> tmp( serial( B ) );
4340 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4341 subAssign( C, tmp );
4342 }
4343 else if( IsTriangular_v<MT5> ) {
4344 ResultType_t<MT3> tmp( serial( A ) );
4345 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4346 subAssign( C, tmp );
4347 }
4348 else {
4349 gemm( C, A, B, ET(-1), ET(1) );
4350 }
4351 }
4353#endif
4354 //**********************************************************************************************
4355
4356 //**Restructuring subtraction assignment to column-major matrices*******************************
4371 template< typename MT > // Type of the target matrix
4372 friend inline auto subAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4373 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4374 {
4376
4378
4379 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4380 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4381
4382 const ForwardFunctor fwd;
4383
4384 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4385 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4386
4387 subAssign( *lhs, fwd( A * B ) );
4388 }
4390 //**********************************************************************************************
4391
4392 //**Subtraction assignment to sparse matrices***************************************************
4393 // No special implementation for the subtraction assignment to sparse matrices.
4394 //**********************************************************************************************
4395
4396 //**Schur product assignment to dense matrices**************************************************
4409 template< typename MT // Type of the target dense matrix
4410 , bool SO > // Storage order of the target dense matrix
4411 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4412 {
4414
4418
4419 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4420 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4421
4422 const ResultType tmp( serial( rhs ) );
4423 schurAssign( *lhs, tmp );
4424 }
4426 //**********************************************************************************************
4427
4428 //**Schur product assignment to sparse matrices*************************************************
4429 // No special implementation for the Schur product assignment to sparse matrices.
4430 //**********************************************************************************************
4431
4432 //**Multiplication assignment to dense matrices*************************************************
4433 // No special implementation for the multiplication assignment to dense matrices.
4434 //**********************************************************************************************
4435
4436 //**Multiplication assignment to sparse matrices************************************************
4437 // No special implementation for the multiplication assignment to sparse matrices.
4438 //**********************************************************************************************
4439
4440 //**SMP assignment to dense matrices************************************************************
4455 template< typename MT // Type of the target dense matrix
4456 , bool SO > // Storage order of the target dense matrix
4457 friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4458 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4459 {
4461
4462 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4463 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4464
4465 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
4466 return;
4467 }
4468 else if( rhs.lhs_.columns() == 0UL ) {
4469 reset( *lhs );
4470 return;
4471 }
4472
4473 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4474 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4475
4476 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4477 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4478 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4479 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4480 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
4481 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
4482
4483 smpAssign( *lhs, A * B );
4484 }
4486 //**********************************************************************************************
4487
4488 //**SMP assignment to sparse matrices***********************************************************
/*!\brief SMP assignment of a dense matrix-dense matrix multiplication to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// This overload takes part in overload resolution only if IsEvaluationRequired_v<MT,MT1,MT2>
// is \a true. The expression is evaluated into a temporary of type \c TmpType, the temporary
// is passed through the ForwardFunctor, and the result is forwarded to smpAssign().
//
// NOTE(review): the listing skips source lines inside this body (the embedded numbering jumps
// 4507 -> 4509 and 4511 -> 4518); check the upstream header for the omitted statements.
*/
4503 template< typename MT // Type of the target sparse matrix
4504 , bool SO > // Storage order of the target sparse matrix
4505 friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4506 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4507 {
4509
// The temporary uses OppositeType when SO is true and ResultType otherwise
4510 using TmpType = If_t< SO, OppositeType, ResultType >;
4511
4518
4519 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4520 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4521
4522 const ForwardFunctor fwd;
4523
// Evaluate the complete product first, then assign the (forwarded) temporary
4524 const TmpType tmp( rhs );
4525 smpAssign( *lhs, fwd( tmp ) );
4526 }
4528 //**********************************************************************************************
4529
4530 //**Restructuring SMP assignment to column-major matrices***************************************
/*!\brief Restructuring SMP assignment of a dense matrix-dense matrix multiplication to a
//        column-major matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// This overload takes part in overload resolution only if CanExploitSymmetry_v<MT,MT1,MT2> is
// \a true. Each operand is passed through transIf<> with the IsSymmetric_v flag of its own
// type, the two results are multiplied, and the product is passed through the ForwardFunctor
// before being forwarded to smpAssign().
//
// NOTE(review): the listing skips source lines at the top of this body (the embedded numbering
// jumps 4548 -> 4550 -> 4552); check the upstream header for the omitted statements.
*/
4545 template< typename MT > // Type of the target matrix
4546 friend inline auto smpAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4547 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4548 {
4550
4552
4553 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4554 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4555
4556 const ForwardFunctor fwd;
4557
// transIf<> is parameterized with the symmetry flag of the respective operand type
4558 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4559 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4560
4561 smpAssign( *lhs, fwd( A * B ) );
4562 }
4564 //**********************************************************************************************
4565
4566 //**SMP addition assignment to dense matrices***************************************************
/*!\brief SMP addition assignment of a dense matrix-dense matrix multiplication to a dense
//        matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be added.
// \return void
//
// This overload takes part in overload resolution only if IsEvaluationRequired_v<MT,MT1,MT2>
// is \a true. Both operands of the product are evaluated into objects of type \c LT and \c RT
// and the product of the evaluated operands is forwarded to smpAddAssign().
//
// NOTE(review): the listing skips a source line right after the opening brace (the embedded
// numbering jumps from 4586 to 4588); check the upstream header for the omitted statement.
*/
4582 template< typename MT // Type of the target dense matrix
4583 , bool SO > // Storage order of the target dense matrix
4584 friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4585 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4586 {
4588
4589 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4590 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4591
// Empty target or empty inner dimension: the target is left untouched
4592 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4593 return;
4594 }
4595
4596 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4597 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4598
4599 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4600 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4601 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4602 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4603 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
4604 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
4605
// Re-dispatch on the product of the already evaluated operands
4606 smpAddAssign( *lhs, A * B );
4607 }
4609 //**********************************************************************************************
4610
4611 //**Restructuring SMP addition assignment to column-major matrices******************************
/*!\brief Restructuring SMP addition assignment of a dense matrix-dense matrix multiplication
//        to a column-major matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be added.
// \return void
//
// This overload takes part in overload resolution only if CanExploitSymmetry_v<MT,MT1,MT2> is
// \a true. Each operand is passed through transIf<> with the IsSymmetric_v flag of its own
// type, the two results are multiplied, and the product is passed through the ForwardFunctor
// before being forwarded to smpAddAssign().
//
// NOTE(review): the listing skips source lines at the top of this body (the embedded numbering
// jumps 4629 -> 4631 -> 4633); check the upstream header for the omitted statements.
*/
4626 template< typename MT > // Type of the target matrix
4627 friend inline auto smpAddAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4628 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4629 {
4631
4633
4634 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4635 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4636
4637 const ForwardFunctor fwd;
4638
// transIf<> is parameterized with the symmetry flag of the respective operand type
4639 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4640 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4641
4642 smpAddAssign( *lhs, fwd( A * B ) );
4643 }
4645 //**********************************************************************************************
4646
4647 //**SMP addition assignment to sparse matrices**************************************************
4648 // No special implementation for the SMP addition assignment to sparse matrices.
4649 //**********************************************************************************************
4650
4651 //**SMP subtraction assignment to dense matrices************************************************
/*!\brief SMP subtraction assignment of a dense matrix-dense matrix multiplication to a dense
//        matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
// \return void
//
// This overload takes part in overload resolution only if IsEvaluationRequired_v<MT,MT1,MT2>
// is \a true. Both operands of the product are evaluated into objects of type \c LT and \c RT
// and the product of the evaluated operands is forwarded to smpSubAssign().
//
// NOTE(review): the listing skips a source line right after the opening brace (the embedded
// numbering jumps from 4671 to 4673); check the upstream header for the omitted statement.
*/
4667 template< typename MT // Type of the target dense matrix
4668 , bool SO > // Storage order of the target dense matrix
4669 friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4670 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4671 {
4673
4674 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4675 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4676
// Empty target or empty inner dimension: the target is left untouched
4677 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4678 return;
4679 }
4680
4681 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4682 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4683
4684 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4685 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4686 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4687 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4688 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
4689 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
4690
// Re-dispatch on the product of the already evaluated operands
4691 smpSubAssign( *lhs, A * B );
4692 }
4694 //**********************************************************************************************
4695
4696 //**Restructuring SMP subtraction assignment to column-major matrices***************************
/*!\brief Restructuring SMP subtraction assignment of a dense matrix-dense matrix
//        multiplication to a column-major matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
// \return void
//
// This overload takes part in overload resolution only if CanExploitSymmetry_v<MT,MT1,MT2> is
// \a true. Each operand is passed through transIf<> with the IsSymmetric_v flag of its own
// type, the two results are multiplied, and the product is passed through the ForwardFunctor
// before being forwarded to smpSubAssign().
//
// NOTE(review): the listing skips source lines at the top of this body (the embedded numbering
// jumps 4714 -> 4716 -> 4718); check the upstream header for the omitted statements.
*/
4711 template< typename MT > // Type of the target matrix
4712 friend inline auto smpSubAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4713 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4714 {
4716
4718
4719 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4720 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4721
4722 const ForwardFunctor fwd;
4723
// transIf<> is parameterized with the symmetry flag of the respective operand type
4724 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4725 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4726
4727 smpSubAssign( *lhs, fwd( A * B ) );
4728 }
4730 //**********************************************************************************************
4731
4732 //**SMP subtraction assignment to sparse matrices***********************************************
4733 // No special implementation for the SMP subtraction assignment to sparse matrices.
4734 //**********************************************************************************************
4735
4736 //**SMP Schur product assignment to dense matrices**********************************************
/*!\brief SMP Schur product assignment of a dense matrix-dense matrix multiplication to a
//        dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression for the Schur product.
// \return void
//
// The multiplication expression is evaluated into a temporary of type \c ResultType, which is
// then forwarded to smpSchurAssign(). Unlike the other SMP overloads in this section, this
// function is not constrained by an EnableIf_t condition.
//
// NOTE(review): the listing skips source lines at the top of this body (the embedded numbering
// jumps 4752 -> 4754 -> 4758); check the upstream header for the omitted statements.
*/
4749 template< typename MT // Type of the target dense matrix
4750 , bool SO > // Storage order of the target dense matrix
4751 friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4752 {
4754
4758
4759 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4760 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4761
// Evaluate the complete product first, then combine the temporary with the target
4762 const ResultType tmp( rhs );
4763 smpSchurAssign( *lhs, tmp );
4764 }
4766 //**********************************************************************************************
4767
4768 //**SMP Schur product assignment to sparse matrices*********************************************
4769 // No special implementation for the SMP Schur product assignment to sparse matrices.
4770 //**********************************************************************************************
4771
4772 //**SMP multiplication assignment to dense matrices*********************************************
4773 // No special implementation for the SMP multiplication assignment to dense matrices.
4774 //**********************************************************************************************
4775
4776 //**SMP multiplication assignment to sparse matrices********************************************
4777 // No special implementation for the SMP multiplication assignment to sparse matrices.
4778 //**********************************************************************************************
4779
4780 //**Compile time checks*************************************************************************
4788 //**********************************************************************************************
4789};
4790//*************************************************************************************************
4791
4792
4793
4794
4795//=================================================================================================
4796//
4797// DMATSCALARMULTEXPR SPECIALIZATION
4798//
4799//=================================================================================================
4800
4801//*************************************************************************************************
4809template< typename MT1 // Type of the left-hand side dense matrix
4810 , typename MT2 // Type of the right-hand side dense matrix
4811 , bool SF // Symmetry flag
4812 , bool HF // Hermitian flag
4813 , bool LF // Lower flag
4814 , bool UF // Upper flag
4815 , typename ST > // Type of the right-hand side scalar value
4816class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4817 : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4818 , private Computation
4819{
4820 private:
4821 //**Type definitions****************************************************************************
//! Type of the dense matrix multiplication expression that is scaled.
4823 using MMM = DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4824
//! Result type of the dense matrix multiplication expression.
4825 using RES = ResultType_t<MMM>;
//! Result type of the left-hand side dense matrix expression.
4826 using RT1 = ResultType_t<MT1>;
//! Result type of the right-hand side dense matrix expression.
4827 using RT2 = ResultType_t<MT2>;
//! Element type of the left-hand side result type.
4828 using ET1 = ElementType_t<RT1>;
//! Element type of the right-hand side result type.
4829 using ET2 = ElementType_t<RT2>;
//! Composite type of the left-hand side dense matrix expression.
4830 using CT1 = CompositeType_t<MT1>;
//! Composite type of the right-hand side dense matrix expression.
4831 using CT2 = CompositeType_t<MT2>;
4832 //**********************************************************************************************
4833
4834 //**********************************************************************************************
//! Compilation switch: \a true if the left-hand side operand type is a computation or requires
//! an evaluation. Selects between \c RT1 and \c CT1 for the \c LT alias.
4836 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4837 //**********************************************************************************************
4838
4839 //**********************************************************************************************
//! Compilation switch: \a true if the right-hand side operand type is a computation or requires
//! an evaluation. Selects between \c RT2 and \c CT2 for the \c RT alias.
4841 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
4842 //**********************************************************************************************
4843
4844 //**********************************************************************************************
4845 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
4846 static constexpr bool HERM = ( HF && !( LF || UF ) );
4847 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4848 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
4849 //**********************************************************************************************
4850
4851 //**********************************************************************************************
4853
/*! Helper variable template for the selection of the restructuring assignment strategy.
// It is \a true if \a T1 is a column-major matrix type and at least one of \a T2 and \a T3 is
// symmetric. Within this class it is instantiated with the target type and the two operand
// types of the multiplication (see assign()).
*/
4857 template< typename T1, typename T2, typename T3 >
4858 static constexpr bool CanExploitSymmetry_v =
4859 ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
4860 //**********************************************************************************************
4861
4862 //**********************************************************************************************
4864
/*! Helper variable template for the explicit evaluation of the operands.
// It is \a true if at least one of the two operands needs to be evaluated (\c evaluateLeft or
// \c evaluateRight) and the symmetry of the operands cannot be exploited for the given types
// (CanExploitSymmetry_v<T1,T2,T3> is \a false).
*/
4867 template< typename T1, typename T2, typename T3 >
4868 static constexpr bool IsEvaluationRequired_v =
4869 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
4870 //**********************************************************************************************
4871
4872 //**********************************************************************************************
4874
/*! Helper variable template for the selection of a BLAS kernel.
// It is \a true only if all of the following conditions hold:
//  - BLAS mode and the use of BLAS for matrix-matrix multiplications are both enabled;
//  - none of the structure flags \c SYM, \c HERM, \c LOW and \c UPP is set;
//  - \a T1 is contiguous with mutable data access, \a T2 and \a T3 are contiguous with
//    constant data access;
//  - neither \a T2 nor \a T3 is diagonal;
//  - all three matrix types are SIMD enabled;
//  - all three element types are BLAS compatible and identical;
//  - the combination of a builtin element type with a complex \a T4 is excluded.
// Presumably \a T1 is the target type, \a T2 / \a T3 the operand types and \a T4 the scalar
// type (no instantiation is visible in this part of the file) -- TODO confirm.
*/
4876 template< typename T1, typename T2, typename T3, typename T4 >
4877 static constexpr bool UseBlasKernel_v =
4878 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4879 !SYM && !HERM && !LOW && !UPP &&
4880 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4881 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4882 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4883 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4884 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4885 IsBLASCompatible_v< ElementType_t<T1> > &&
4886 IsBLASCompatible_v< ElementType_t<T2> > &&
4887 IsBLASCompatible_v< ElementType_t<T3> > &&
4888 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4889 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4890 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4891 //**********************************************************************************************
4892
4893 //**********************************************************************************************
4895
/*! Helper variable template for the selection of a vectorized default kernel.
// It is instantiated with the target type (\a T1), the two operand types (\a T2, \a T3) and
// the scalar type (\a T4), see selectSmallAssignKernel(). It is \a true only if
//  - \c useOptimizedKernels is set;
//  - the right-hand side operand type \a T3 is not diagonal;
//  - all three matrix types are SIMD enabled;
//  - the three element types and \a T4 are SIMD combinable;
//  - SIMD addition and SIMD multiplication are available for the element types of \a T2 and
//    \a T3.
*/
4897 template< typename T1, typename T2, typename T3, typename T4 >
4898 static constexpr bool UseVectorizedDefaultKernel_v =
4899 ( useOptimizedKernels &&
4900 !IsDiagonal_v<T3> &&
4901 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4902 IsSIMDCombinable_v< ElementType_t<T1>
4903 , ElementType_t<T2>
4904 , ElementType_t<T3>
4905 , T4 > &&
4906 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4907 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
4908 //**********************************************************************************************
4909
4910 //**********************************************************************************************
4912
/*! Type of the functor for forwarding an expression to another assign kernel.
// The functor is selected from the structure flags in the following order of precedence:
//  - \c HERM          -> DeclHerm
//  - \c SYM           -> DeclSym
//  - \c LOW and \c UPP -> DeclDiag
//  - \c LOW only      -> DeclLow
//  - \c UPP only      -> DeclUpp
//  - no flag set      -> Noop
*/
4914 using ForwardFunctor = If_t< HERM
4915 , DeclHerm
4916 , If_t< SYM
4917 , DeclSym
4918 , If_t< LOW
4919 , If_t< UPP
4920 , DeclDiag
4921 , DeclLow >
4922 , If_t< UPP
4923 , DeclUpp
4924 , Noop > > > >;
4925 //**********************************************************************************************
4926
4927 public:
4928 //**Type definitions****************************************************************************
//! Type of this DMatScalarMultExpr instance.
4930 using This = DMatScalarMultExpr<MMM,ST,false>;
4931
//! Base type of this DMatScalarMultExpr instance.
4933 using BaseType = MatScalarMultExpr< DenseMatrix<This,false> >;
4934
/*! Result type for expression template evaluations.
// Starting from the multiplication trait of \c RES and \c ST, the nested \c Type of one of the
// following traits is selected according to the structure flags (same order of precedence as
// for the ForwardFunctor): DeclHermTrait (\c HERM), DeclSymTrait (\c SYM), DeclDiagTrait
// (\c LOW and \c UPP), DeclLowTrait (\c LOW), DeclUppTrait (\c UPP), or the plain
// MultTrait<RES,ST> if no flag is set.
*/
4936 using ResultType = typename If_t< HERM
4937 , DeclHermTrait< MultTrait_t<RES,ST> >
4938 , If_t< SYM
4939 , DeclSymTrait< MultTrait_t<RES,ST> >
4940 , If_t< LOW
4941 , If_t< UPP
4942 , DeclDiagTrait< MultTrait_t<RES,ST> >
4943 , DeclLowTrait< MultTrait_t<RES,ST> > >
4944 , If_t< UPP
4945 , DeclUppTrait< MultTrait_t<RES,ST> >
4946 , MultTrait<RES,ST> > > > >::Type;
4947
//! Result type with opposite storage order for expression template evaluations.
4948 using OppositeType = OppositeType_t<ResultType>;
//! Transpose type for expression template evaluations.
4949 using TransposeType = TransposeType_t<ResultType>;
//! Resulting element type.
4950 using ElementType = ElementType_t<ResultType>;
//! Resulting SIMD element type.
4951 using SIMDType = SIMDTrait_t<ElementType>;
//! Return type of the element access functions (elements are returned by constant value).
4952 using ReturnType = const ElementType;
//! Data type for composite expression templates.
4953 using CompositeType = const ResultType;
4954
//! Composite type of the left-hand side dense matrix expression (the matrix multiplication).
4956 using LeftOperand = const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4957
//! Composite type of the right-hand side scalar value.
4959 using RightOperand = ST;
4960
//! Type for the assignment of the left-hand side matrix operand of the multiplication:
//! \c const \c RT1 if the operand has to be evaluated, \c CT1 otherwise.
4962 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4963
//! Type for the assignment of the right-hand side matrix operand of the multiplication:
//! \c const \c RT2 if the operand has to be evaluated, \c CT2 otherwise.
4965 using RT = If_t< evaluateRight, const RT2, CT2 >;
4966 //**********************************************************************************************
4967
4968 //**Compilation flags***************************************************************************
//! Compilation switch for the expression template evaluation strategy: \a true if the
//! right-hand side operand type is not diagonal, both operand types are SIMD enabled, the
//! element types and the scalar type are SIMD combinable, and SIMD addition and SIMD
//! multiplication are available for the two element types.
4970 static constexpr bool simdEnabled =
4971 ( !IsDiagonal_v<MT2> &&
4972 MT1::simdEnabled && MT2::simdEnabled &&
4973 IsSIMDCombinable_v<ET1,ET2,ST> &&
4974 HasSIMDAdd_v<ET1,ET2> &&
4975 HasSIMDMult_v<ET1,ET2> );
4976
//! Compilation switch for the expression template assignment strategy: \a true if neither
//! operand has to be evaluated and both operand types are SMP assignable.
4978 static constexpr bool smpAssignable =
4979 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4980 //**********************************************************************************************
4981
4982 //**SIMD properties*****************************************************************************
//! The number of elements packed within a single SIMD element, taken from
//! SIMDTrait<ElementType>::size.
4984 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
4985 //**********************************************************************************************
4986
4987 //**Constructor*********************************************************************************
/*!\brief Constructor for the DMatScalarMultExpr class.
//
// \param matrix The left-hand side dense matrix multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
//
// Both arguments are stored in the corresponding members \c matrix_ and \c scalar_.
*/
4993 inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4994 : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4995 , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4996 {}
4997 //**********************************************************************************************
4998
4999 //**Access operator*****************************************************************************
/*!\brief 2D-access to the matrix elements.
//
// \param i Access index for the row. Must be smaller than the number of rows.
// \param j Access index for the column. Must be smaller than the number of columns.
// \return The element (i,j) of the wrapped matrix product multiplied by the scalar.
//
// The indices are only checked by means of BLAZE_INTERNAL_ASSERT; see at() for an access
// function with explicit bounds checks.
*/
5006 inline ReturnType operator()( size_t i, size_t j ) const {
5007 BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
5008 BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
5009 return matrix_(i,j) * scalar_;
5010 }
5011 //**********************************************************************************************
5012
5013 //**At function*********************************************************************************
5021 inline ReturnType at( size_t i, size_t j ) const {
5022 if( i >= matrix_.rows() ) {
5023 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
5024 }
5025 if( j >= matrix_.columns() ) {
5026 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
5027 }
5028 return (*this)(i,j);
5029 }
5030 //**********************************************************************************************
5031
5032 //**Rows function*******************************************************************************
/*!\brief Returns the current number of rows of the matrix.
//
// \return The number of rows of the wrapped matrix multiplication expression.
*/
5037 inline size_t rows() const {
5038 return matrix_.rows();
5039 }
5040 //**********************************************************************************************
5041
5042 //**Columns function****************************************************************************
/*!\brief Returns the current number of columns of the matrix.
//
// \return The number of columns of the wrapped matrix multiplication expression.
*/
5047 inline size_t columns() const {
5048 return matrix_.columns();
5049 }
5050 //**********************************************************************************************
5051
5052 //**Left operand access*************************************************************************
/*!\brief Returns the left-hand side dense matrix operand.
//
// \return The stored matrix multiplication expression (\c matrix_).
*/
5057 inline LeftOperand leftOperand() const {
5058 return matrix_;
5059 }
5060 //**********************************************************************************************
5061
5062 //**Right operand access************************************************************************
/*!\brief Returns the right-hand side scalar operand.
//
// \return The stored scalar value (\c scalar_).
*/
5067 inline RightOperand rightOperand() const {
5068 return scalar_;
5069 }
5070 //**********************************************************************************************
5071
5072 //**********************************************************************************************
/*!\brief Returns whether the expression can alias with the given address \a alias.
//
// \param alias The alias to be checked.
// \return The result of the canAlias() query on the wrapped matrix multiplication expression.
*/
5078 template< typename T >
5079 inline bool canAlias( const T* alias ) const {
5080 return matrix_.canAlias( alias );
5081 }
5082 //**********************************************************************************************
5083
5084 //**********************************************************************************************
/*!\brief Returns whether the expression is aliased with the given address \a alias.
//
// \param alias The alias to be checked.
// \return The result of the isAliased() query on the wrapped matrix multiplication expression.
*/
5090 template< typename T >
5091 inline bool isAliased( const T* alias ) const {
5092 return matrix_.isAliased( alias );
5093 }
5094 //**********************************************************************************************
5095
5096 //**********************************************************************************************
/*!\brief Returns whether the operands of the expression are properly aligned in memory.
//
// \return The result of the isAligned() query on the wrapped matrix multiplication expression.
*/
5101 inline bool isAligned() const {
5102 return matrix_.isAligned();
5103 }
5104 //**********************************************************************************************
5105
5106 //**********************************************************************************************
/*!\brief Returns whether the expression can be used in SMP assignments.
//
// \return \a true in case the expression can be used in SMP assignments, \a false if not.
//
// As listed, the expression qualifies if the number of elements (rows times columns) reaches
// SMP_DMATDMATMULT_THRESHOLD and, in addition, either BLAS mode or the use of BLAS for
// matrix-matrix multiplications is disabled or the number of elements is below
// DMATDMATMULT_THRESHOLD.
//
// NOTE(review): the listing skips one source line inside the first parenthesized condition
// (the embedded numbering jumps from 5113 to 5115), so a further alternative of that
// disjunction may be missing here; check the upstream header.
*/
5111 inline bool canSMPAssign() const noexcept {
5112 return ( !BLAZE_BLAS_MODE ||
5113 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
5115 ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
5116 ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD );
5117 }
5118 //**********************************************************************************************
5119
5120 private:
5121 //**Member variables****************************************************************************
5124 //**********************************************************************************************
5125
5126 //**Assignment to dense matrices****************************************************************
/*!\brief Assignment of a scaled dense matrix-dense matrix multiplication to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// This overload is disabled if CanExploitSymmetry_v<MT,MT1,MT2> is \a true. The two operands
// of the wrapped matrix product are evaluated (wrapped in serial()) into objects of type
// \c LT and \c RT and handed to selectAssignKernel() together with the scalar.
//
// NOTE(review): the listing skips a source line right after the opening brace (the embedded
// numbering jumps from 5142 to 5144); check the upstream header for the omitted statement.
*/
5138 template< typename MT // Type of the target dense matrix
5139 , bool SO > // Storage order of the target dense matrix
5140 friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5141 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
5142 {
5144
5145 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
5146 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
5147
// Operands of the wrapped matrix-matrix multiplication
5148 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5149 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
5150
// An empty target requires no work at all; an empty inner dimension (no columns in the
// left operand) only requires resetting the target.
5151 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
5152 return;
5153 }
5154 else if( left.columns() == 0UL ) {
5155 reset( *lhs );
5156 return;
5157 }
5158
5159 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5160 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5161
5162 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5163 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5164 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5165 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5166 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
5167 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
5168
5169 DMatScalarMultExpr::selectAssignKernel( *lhs, A, B, rhs.scalar_ );
5170 }
5171 //**********************************************************************************************
5172
5173 //**Assignment to dense matrices (kernel selection)*********************************************
5184 template< typename MT3 // Type of the left-hand side target matrix
5185 , typename MT4 // Type of the left-hand side matrix operand
5186 , typename MT5 // Type of the right-hand side matrix operand
5187 , typename ST2 > // Type of the scalar value
5188 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5189 {
5190 if( ( IsDiagonal_v<MT5> ) ||
5191 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
5192 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5193 selectSmallAssignKernel( C, A, B, scalar );
5194 else
5195 selectBlasAssignKernel( C, A, B, scalar );
5196 }
5197 //**********************************************************************************************
5198
5199 //**Default assignment to dense matrices (general/general)**************************************
/*!\brief Default assignment of a scaled dense matrix-dense matrix multiplication where
//        neither operand is diagonal.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected if neither \a MT4 nor \a MT5 is a diagonal matrix type. The target
// is computed row by row: the first inner index initializes the elements of the row, the
// remaining inner indices are accumulated, and the row is finally multiplied by \a scalar.
// All loop ranges are narrowed according to the triangular structure of the operands and the
// structure flags \c SYM, \c HERM, \c LOW and \c UPP. For symmetric and Hermitian results only
// the elements on and above the diagonal are computed; the remaining elements are copied from
// their mirrored counterparts afterwards.
*/
5213 template< typename MT3 // Type of the left-hand side target matrix
5214 , typename MT4 // Type of the left-hand side matrix operand
5215 , typename MT5 // Type of the right-hand side matrix operand
5216 , typename ST2 > // Type of the scalar value
5217 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5218 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5219 {
5220 const size_t M( A.rows() );
5221 const size_t N( B.columns() );
5222 const size_t K( A.columns() );
5223
// Any structure flag implies a square result
5224 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5225
5226 for( size_t i=0UL; i<M; ++i )
5227 {
// Range [kbegin,kend) of inner indices, narrowed by the triangular structure of A
5228 const size_t kbegin( ( IsUpper_v<MT4> )
5229 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5230 :( 0UL ) );
5231 const size_t kend( ( IsLower_v<MT4> )
5232 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5233 :( K ) );
5234 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5235
// Empty inner range for a strictly triangular A: the complete row of C is reset
5236 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
5237 for( size_t j=0UL; j<N; ++j ) {
5238 reset( C(i,j) );
5239 }
5240 continue;
5241 }
5242
// First inner index (k == kbegin): the elements in [jbegin,jend) are initialized by plain
// assignment; elements outside of this range are reset where the structure requires it.
5243 {
5244 const size_t jbegin( ( IsUpper_v<MT5> )
5245 ?( ( IsStrictlyUpper_v<MT5> )
5246 ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
5247 :( UPP ? max(i,kbegin) : kbegin ) )
5248 :( UPP ? i : 0UL ) );
5249 const size_t jend( ( IsLower_v<MT5> )
5250 ?( ( IsStrictlyLower_v<MT5> )
5251 ?( LOW ? min(i+1UL,kbegin) : kbegin )
5252 :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
5253 :( LOW ? i+1UL : N ) );
5254
5255 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5256 for( size_t j=0UL; j<jbegin; ++j ) {
5257 reset( C(i,j) );
5258 }
5259 }
5260 else if( IsStrictlyUpper_v<MT5> ) {
5261 reset( C(i,0UL) );
5262 }
5263 for( size_t j=jbegin; j<jend; ++j ) {
5264 C(i,j) = A(i,kbegin) * B(kbegin,j);
5265 }
5266 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5267 for( size_t j=jend; j<N; ++j ) {
5268 reset( C(i,j) );
5269 }
5270 }
5271 else if( IsStrictlyLower_v<MT5> ) {
5272 reset( C(i,N-1UL) );
5273 }
5274 }
5275
// Remaining inner indices: accumulate into the elements of row i
5276 for( size_t k=kbegin+1UL; k<kend; ++k )
5277 {
5278 const size_t jbegin( ( IsUpper_v<MT5> )
5279 ?( ( IsStrictlyUpper_v<MT5> )
5280 ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
5281 :( SYM || HERM || UPP ? max( i, k ) : k ) )
5282 :( SYM || HERM || UPP ? i : 0UL ) );
5283 const size_t jend( ( IsLower_v<MT5> )
5284 ?( ( IsStrictlyLower_v<MT5> )
5285 ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
5286 :( LOW ? min(i+1UL,k) : k ) )
5287 :( LOW ? i+1UL : N ) );
5288
// With a structure flag set, the narrowed column range may be empty for this k
5289 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5290 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5291
5292 for( size_t j=jbegin; j<jend; ++j ) {
5293 C(i,j) += A(i,k) * B(k,j);
5294 }
// For a lower B, the element in column jend is assigned (not accumulated) in this step
5295 if( IsLower_v<MT5> ) {
5296 C(i,jend) = A(i,k) * B(k,jend);
5297 }
5298 }
5299
// Scaling of the computed elements of row i
5300 {
5301 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5302 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
5303 :( SYM || HERM || UPP ? i : 0UL ) );
5304 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
5305 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
5306 :( LOW ? i+1UL : N ) );
5307
5308 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5309 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5310
5311 for( size_t j=jbegin; j<jend; ++j ) {
5312 C(i,j) *= scalar;
5313 }
5314 }
5315 }
5316
// Symmetric/Hermitian results: fill the elements below the diagonal from their mirrored
// counterparts above the diagonal (conjugated in the Hermitian case)
5317 if( SYM || HERM ) {
5318 for( size_t i=1UL; i<M; ++i ) {
5319 for( size_t j=0UL; j<i; ++j ) {
5320 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5321 }
5322 }
5323 }
5324 }
5325 //**********************************************************************************************
5326
5327 //**Default assignment to dense matrices (general/diagonal)*************************************
/*!\brief Default assignment of a scaled general dense matrix-diagonal dense matrix
//        multiplication.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected if \a MT4 is not diagonal and \a MT5 is diagonal. Since only the
// diagonal of \a B is accessed, every target element is computed from a single product
// A(i,j) * B(j,j) * scalar. The column range per row follows the triangular structure of \a A;
// elements outside of this range are reset.
//
// NOTE(review): the listing skips a source line right after the opening brace (the embedded
// numbering jumps from 5347 to 5349); check the upstream header for the omitted statement.
*/
5341 template< typename MT3 // Type of the left-hand side target matrix
5342 , typename MT4 // Type of the left-hand side matrix operand
5343 , typename MT5 // Type of the right-hand side matrix operand
5344 , typename ST2 > // Type of the scalar value
5345 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5346 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5347 {
5349
5350 const size_t M( A.rows() );
5351 const size_t N( B.columns() );
5352
5353 for( size_t i=0UL; i<M; ++i )
5354 {
// Column range [jbegin,jend) of row i, narrowed by the triangular structure of A
5355 const size_t jbegin( ( IsUpper_v<MT4> )
5356 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5357 :( 0UL ) );
5358 const size_t jend( ( IsLower_v<MT4> )
5359 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5360 :( N ) );
5361 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5362
// Elements in front of the range are reset for an upper A
5363 if( IsUpper_v<MT4> ) {
5364 for( size_t j=0UL; j<jbegin; ++j ) {
5365 reset( C(i,j) );
5366 }
5367 }
5368 for( size_t j=jbegin; j<jend; ++j ) {
5369 C(i,j) = A(i,j) * B(j,j) * scalar;
5370 }
// Elements behind the range are reset for a lower A
5371 if( IsLower_v<MT4> ) {
5372 for( size_t j=jend; j<N; ++j ) {
5373 reset( C(i,j) );
5374 }
5375 }
5376 }
5377 }
5378 //**********************************************************************************************
5379
5380 //**Default assignment to dense matrices (diagonal/general)*************************************
/*!\brief Default assignment of a scaled diagonal dense matrix-general dense matrix
//        multiplication.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected if \a MT4 is diagonal and \a MT5 is not diagonal. Since only the
// diagonal of \a A is accessed, every target element is computed from a single product
// A(i,i) * B(i,j) * scalar. The column range per row follows the triangular structure of \a B;
// elements outside of this range are reset.
//
// NOTE(review): the listing skips a source line right after the opening brace (the embedded
// numbering jumps from 5400 to 5402); check the upstream header for the omitted statement.
*/
5394 template< typename MT3 // Type of the left-hand side target matrix
5395 , typename MT4 // Type of the left-hand side matrix operand
5396 , typename MT5 // Type of the right-hand side matrix operand
5397 , typename ST2 > // Type of the scalar value
5398 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5399 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5400 {
5402
5403 const size_t M( A.rows() );
5404 const size_t N( B.columns() );
5405
5406 for( size_t i=0UL; i<M; ++i )
5407 {
// Column range [jbegin,jend) of row i, narrowed by the triangular structure of B
5408 const size_t jbegin( ( IsUpper_v<MT5> )
5409 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5410 :( 0UL ) );
5411 const size_t jend( ( IsLower_v<MT5> )
5412 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
5413 :( N ) );
5414 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5415
// Elements in front of the range are reset for an upper B
5416 if( IsUpper_v<MT5> ) {
5417 for( size_t j=0UL; j<jbegin; ++j ) {
5418 reset( C(i,j) );
5419 }
5420 }
5421 for( size_t j=jbegin; j<jend; ++j ) {
5422 C(i,j) = A(i,i) * B(i,j) * scalar;
5423 }
// Elements behind the range are reset for a lower B
5424 if( IsLower_v<MT5> ) {
5425 for( size_t j=jend; j<N; ++j ) {
5426 reset( C(i,j) );
5427 }
5428 }
5429 }
5430 }
5431 //**********************************************************************************************
5432
5433 //**Default assignment to dense matrices (diagonal/diagonal)************************************
/*!\brief Default assignment of a scaled diagonal dense matrix-diagonal dense matrix
//        multiplication.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected if both \a MT4 and \a MT5 are diagonal. The target is reset as a
// whole and afterwards only its diagonal elements are set to A(i,i) * B(i,i) * scalar.
//
// NOTE(review): the listing skips a source line right after the opening brace (the embedded
// numbering jumps from 5453 to 5455); check the upstream header for the omitted statement.
*/
5447 template< typename MT3 // Type of the left-hand side target matrix
5448 , typename MT4 // Type of the left-hand side matrix operand
5449 , typename MT5 // Type of the right-hand side matrix operand
5450 , typename ST2 > // Type of the scalar value
5451 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5452 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5453 {
5455
// Reset all elements first; only the diagonal is written afterwards
5456 reset( C );
5457
5458 for( size_t i=0UL; i<A.rows(); ++i ) {
5459 C(i,i) = A(i,i) * B(i,i) * scalar;
5460 }
5461 }
5462 //**********************************************************************************************
5463
5464 //**Default assignment to dense matrices (small matrices)***************************************
/*!\brief Default assignment of a small scaled dense matrix-dense matrix multiplication.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected if the vectorized default kernel cannot be used for the given
// types (UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is \a false). It forwards all arguments
// unchanged to selectDefaultAssignKernel().
*/
5478 template< typename MT3 // Type of the left-hand side target matrix
5479 , typename MT4 // Type of the left-hand side matrix operand
5480 , typename MT5 // Type of the right-hand side matrix operand
5481 , typename ST2 > // Type of the scalar value
5482 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5483 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5484 {
5485 selectDefaultAssignKernel( C, A, B, scalar );
5486 }
5487 //**********************************************************************************************
5488
5489 //**Vectorized default assignment to row-major dense matrices (small matrices)******************
/*!\brief Vectorized default assignment of a scaled dense matrix-dense matrix multiplication
//        to a row-major target (small matrices).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload is only viable when \c MT3 is a row-major matrix and
// \c UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is \c true. Every element of \c C that the
// kernel computes is the accumulated sum of \c A(i,k)*B(k,j) over a k-range, multiplied by
// \c scalar. The columns of \c C are walked in panels of 8 (integral element types only), 5, 4,
// 3, 2 and finally 1 SIMD vector(s); within a panel several rows are processed at once. Columns
// at and beyond \c jpos are handled element-wise by the final scalar loop. The compile-time
// flags \c SYM, \c HERM, \c LOW and \c UPP select additional handling of rows above/below the
// current column panel (copying, conjugating or resetting elements instead of computing them).
*/
5504 template< typename MT3 // Type of the left-hand side target matrix
5505 , typename MT4 // Type of the left-hand side matrix operand
5506 , typename MT5 // Type of the right-hand side matrix operand
5507 , typename ST2 > // Type of the scalar value
5508 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5509 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5510 {
     // A scalar tail loop is required as soon as either the target or the right operand is unpadded.
5511 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5512
5513 const size_t M( A.rows() );
5514 const size_t N( B.columns() );
5515 const size_t K( A.columns() );
5516
5517 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5518
     // Columns [0,jpos) are processed with SIMD loads/stores; with 'remainder' set, jpos is
     // prevMultiple( N, SIMDSIZE ) (presumably the largest multiple of SIMDSIZE not above N --
     // confirm against prevMultiple), otherwise all N columns are vectorized.
5519 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
5520 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
5521
     // SIMD operand built from the scalar; every accumulator is multiplied by it before the store.
5522 const SIMDType factor( set( scalar ) );
5523
5524 size_t j( 0UL );
5525
     // Panels of 8 SIMD vectors, one row at a time. Only entered for integral element types and
     // only iterated when none of SYM/HERM/LOW/UPP is set.
5526 if( IsIntegral_v<ElementType> )
5527 {
5528 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5529 for( size_t i=0UL; i<M; ++i )
5530 {
     // The k-range [kbegin,kend) is narrowed according to the compile-time triangular
     // properties of the operand types MT4 (A) and MT5 (B). The same scheme is used in all
     // panels below, with bounds adapted to the number of rows/columns handled at once.
5531 const size_t kbegin( ( IsUpper_v<MT4> )
5532 ?( ( IsLower_v<MT5> )
5533 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5534 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5535 :( IsLower_v<MT5> ? j : 0UL ) );
5536 const size_t kend( ( IsLower_v<MT4> )
5537 ?( ( IsUpper_v<MT5> )
5538 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5539 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5540 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
5541
5542 size_t k( kbegin );
5543
5544 if( k < kend )
5545 {
     // The accumulators are seeded with the products of the first k-iteration; the loop
     // below adds the remaining iterations.
5546 SIMDType a1( set( A(i,k) ) );
5547 SIMDType xmm1( a1 * B.load(k,j ) );
5548 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
5549 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
5550 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
5551 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
5552 SIMDType xmm6( a1 * B.load(k,j+SIMDSIZE*5UL) );
5553 SIMDType xmm7( a1 * B.load(k,j+SIMDSIZE*6UL) );
5554 SIMDType xmm8( a1 * B.load(k,j+SIMDSIZE*7UL) );
5555
5556 for( ++k; k<kend; ++k ) {
5557 a1 = set( A(i,k) );
5558 xmm1 += a1 * B.load(k,j );
5559 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5560 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5561 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5562 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5563 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
5564 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
5565 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
5566 }
5567
5568 C.store( i, j , xmm1 * factor );
5569 C.store( i, j+SIMDSIZE , xmm2 * factor );
5570 C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5571 C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5572 C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5573 C.store( i, j+SIMDSIZE*5UL, xmm6 * factor );
5574 C.store( i, j+SIMDSIZE*6UL, xmm7 * factor );
5575 C.store( i, j+SIMDSIZE*7UL, xmm8 * factor );
5576 }
5577 else
5578 {
     // Empty k-range: a default-constructed SIMD vector is stored instead of a product sum.
     // NOTE(review): presumably an all-zero vector -- confirm the SIMDType default constructor.
5579 const SIMDType zero;
5580 C.store( i, j , zero );
5581 C.store( i, j+SIMDSIZE , zero );
5582 C.store( i, j+SIMDSIZE*2UL, zero );
5583 C.store( i, j+SIMDSIZE*3UL, zero );
5584 C.store( i, j+SIMDSIZE*4UL, zero );
5585 C.store( i, j+SIMDSIZE*5UL, zero );
5586 C.store( i, j+SIMDSIZE*6UL, zero );
5587 C.store( i, j+SIMDSIZE*7UL, zero );
5588 }
5589 }
5590 }
5591 }
5592
     // Panels of 5 SIMD vectors, two rows at a time plus one leftover row. Like the 8-wide
     // panel, this loop is only iterated when none of SYM/HERM/LOW/UPP is set.
5593 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5594 {
5595 size_t i( 0UL );
5596
5597 for( ; (i+2UL) <= M; i+=2UL )
5598 {
5599 const size_t kbegin( ( IsUpper_v<MT4> )
5600 ?( ( IsLower_v<MT5> )
5601 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5602 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5603 :( IsLower_v<MT5> ? j : 0UL ) );
5604 const size_t kend( ( IsLower_v<MT4> )
5605 ?( ( IsUpper_v<MT5> )
5606 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5607 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5608 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
5609
5610 size_t k( kbegin );
5611
5612 if( k < kend )
5613 {
5614 SIMDType a1( set( A(i ,k) ) );
5615 SIMDType a2( set( A(i+1UL,k) ) );
5616 SIMDType b1( B.load(k,j ) );
5617 SIMDType b2( B.load(k,j+SIMDSIZE ) );
5618 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5619 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5620 SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5621 SIMDType xmm1 ( a1 * b1 );
5622 SIMDType xmm2 ( a1 * b2 );
5623 SIMDType xmm3 ( a1 * b3 );
5624 SIMDType xmm4 ( a1 * b4 );
5625 SIMDType xmm5 ( a1 * b5 );
5626 SIMDType xmm6 ( a2 * b1 );
5627 SIMDType xmm7 ( a2 * b2 );
5628 SIMDType xmm8 ( a2 * b3 );
5629 SIMDType xmm9 ( a2 * b4 );
5630 SIMDType xmm10( a2 * b5 );
5631
5632 for( ++k; k<kend; ++k ) {
5633 a1 = set( A(i ,k) );
5634 a2 = set( A(i+1UL,k) );
5635 b1 = B.load(k,j );
5636 b2 = B.load(k,j+SIMDSIZE );
5637 b3 = B.load(k,j+SIMDSIZE*2UL);
5638 b4 = B.load(k,j+SIMDSIZE*3UL);
5639 b5 = B.load(k,j+SIMDSIZE*4UL);
5640 xmm1 += a1 * b1;
5641 xmm2 += a1 * b2;
5642 xmm3 += a1 * b3;
5643 xmm4 += a1 * b4;
5644 xmm5 += a1 * b5;
5645 xmm6 += a2 * b1;
5646 xmm7 += a2 * b2;
5647 xmm8 += a2 * b3;
5648 xmm9 += a2 * b4;
5649 xmm10 += a2 * b5;
5650 }
5651
5652 C.store( i , j , xmm1 * factor );
5653 C.store( i , j+SIMDSIZE , xmm2 * factor );
5654 C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5655 C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5656 C.store( i , j+SIMDSIZE*4UL, xmm5 * factor );
5657 C.store( i+1UL, j , xmm6 * factor );
5658 C.store( i+1UL, j+SIMDSIZE , xmm7 * factor );
5659 C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
5660 C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
5661 C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
5662 }
5663 else
5664 {
5665 const SIMDType zero;
5666 C.store( i , j , zero );
5667 C.store( i , j+SIMDSIZE , zero );
5668 C.store( i , j+SIMDSIZE*2UL, zero );
5669 C.store( i , j+SIMDSIZE*3UL, zero );
5670 C.store( i , j+SIMDSIZE*4UL, zero );
5671 C.store( i+1UL, j , zero );
5672 C.store( i+1UL, j+SIMDSIZE , zero );
5673 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
5674 C.store( i+1UL, j+SIMDSIZE*3UL, zero );
5675 C.store( i+1UL, j+SIMDSIZE*4UL, zero );
5676 }
5677 }
5678
     // Single leftover row (odd M). Its upper k-bound depends on MT5 only.
5679 if( i < M )
5680 {
5681 const size_t kbegin( ( IsUpper_v<MT4> )
5682 ?( ( IsLower_v<MT5> )
5683 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5684 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5685 :( IsLower_v<MT5> ? j : 0UL ) );
5686 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5687
5688 size_t k( kbegin );
5689
5690 if( k < kend )
5691 {
5692 SIMDType a1( set( A(i,k) ) );
5693 SIMDType xmm1( a1 * B.load(k,j ) );
5694 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
5695 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
5696 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
5697 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
5698
5699 for( ++k; k<kend; ++k ) {
5700 a1 = set( A(i,k) );
5701 xmm1 += a1 * B.load(k,j );
5702 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5703 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5704 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5705 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5706 }
5707
5708 C.store( i, j , xmm1 * factor );
5709 C.store( i, j+SIMDSIZE , xmm2 * factor );
5710 C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5711 C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5712 C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5713 }
5714 else
5715 {
5716 const SIMDType zero;
5717 C.store( i, j , zero );
5718 C.store( i, j+SIMDSIZE , zero );
5719 C.store( i, j+SIMDSIZE*2UL, zero );
5720 C.store( i, j+SIMDSIZE*3UL, zero );
5721 C.store( i, j+SIMDSIZE*4UL, zero );
5722 }
5723 }
5724 }
5725
     // Panels of 4 SIMD vectors, two rows at a time plus one leftover row. From here on the
     // SYM/HERM/LOW/UPP flags are honoured; this loop is skipped when both LOW and UPP are set.
5726 for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5727 {
     // With UPP only rows up to the end of the current column panel are computed.
5728 const size_t iend( UPP ? min(j+SIMDSIZE*4UL,M) : M );
5729 size_t i( 0UL );
5730
     // SYM/HERM: rows i<j of this panel are not computed but copied (conjugated for HERM)
     // from the mirrored elements C(jj,i).
5731 if( SYM || HERM ) {
5732 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
5733 for( ; i<j; ++i ) {
5734 for( size_t jj=j; jj<jjend; ++jj ) {
5735 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
5736 }
5737 }
5738 }
     // LOW: rows i<j of this panel are reset instead of being computed.
5739 else if( LOW ) {
5740 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
5741 for( ; i<j; ++i ) {
5742 for( size_t jj=j; jj<jjend; ++jj ) {
5743 reset( C(i,jj) );
5744 }
5745 }
5746 }
5747
5748 for( ; (i+2UL) <= iend; i+=2UL )
5749 {
5750 const size_t kbegin( ( IsUpper_v<MT4> )
5751 ?( ( IsLower_v<MT5> )
5752 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5753 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5754 :( IsLower_v<MT5> ? j : 0UL ) );
5755 const size_t kend( ( IsLower_v<MT4> )
5756 ?( ( IsUpper_v<MT5> )
5757 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5758 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5759 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
5760
5761 size_t k( kbegin );
5762
5763 if( k < kend )
5764 {
5765 SIMDType a1( set( A(i ,k) ) );
5766 SIMDType a2( set( A(i+1UL,k) ) );
5767 SIMDType b1( B.load(k,j ) );
5768 SIMDType b2( B.load(k,j+SIMDSIZE ) );
5769 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5770 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5771 SIMDType xmm1( a1 * b1 );
5772 SIMDType xmm2( a1 * b2 );
5773 SIMDType xmm3( a1 * b3 );
5774 SIMDType xmm4( a1 * b4 );
5775 SIMDType xmm5( a2 * b1 );
5776 SIMDType xmm6( a2 * b2 );
5777 SIMDType xmm7( a2 * b3 );
5778 SIMDType xmm8( a2 * b4 );
5779
5780 for( ++k; k<kend; ++k ) {
5781 a1 = set( A(i ,k) );
5782 a2 = set( A(i+1UL,k) );
5783 b1 = B.load(k,j );
5784 b2 = B.load(k,j+SIMDSIZE );
5785 b3 = B.load(k,j+SIMDSIZE*2UL);
5786 b4 = B.load(k,j+SIMDSIZE*3UL);
5787 xmm1 += a1 * b1;
5788 xmm2 += a1 * b2;
5789 xmm3 += a1 * b3;
5790 xmm4 += a1 * b4;
5791 xmm5 += a2 * b1;
5792 xmm6 += a2 * b2;
5793 xmm7 += a2 * b3;
5794 xmm8 += a2 * b4;
5795 }
5796
5797 C.store( i , j , xmm1 * factor );
5798 C.store( i , j+SIMDSIZE , xmm2 * factor );
5799 C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5800 C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5801 C.store( i+1UL, j , xmm5 * factor );
5802 C.store( i+1UL, j+SIMDSIZE , xmm6 * factor );
5803 C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
5804 C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
5805 }
5806 else
5807 {
5808 const SIMDType zero;
5809 C.store( i , j , zero );
5810 C.store( i , j+SIMDSIZE , zero );
5811 C.store( i , j+SIMDSIZE*2UL, zero );
5812 C.store( i , j+SIMDSIZE*3UL, zero );
5813 C.store( i+1UL, j , zero );
5814 C.store( i+1UL, j+SIMDSIZE , zero );
5815 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
5816 C.store( i+1UL, j+SIMDSIZE*3UL, zero );
5817 }
5818 }
5819
5820 if( i < iend )
5821 {
5822 const size_t kbegin( ( IsUpper_v<MT4> )
5823 ?( ( IsLower_v<MT5> )
5824 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5825 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5826 :( IsLower_v<MT5> ? j : 0UL ) );
5827 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5828
5829 size_t k( kbegin );
5830
5831 if( k < kend )
5832 {
5833 SIMDType a1( set( A(i,k) ) );
5834 SIMDType xmm1( a1 * B.load(k,j ) );
5835 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
5836 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
5837 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
5838
5839 for( ++k; k<kend; ++k ) {
5840 a1 = set( A(i,k) );
5841 xmm1 += a1 * B.load(k,j );
5842 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5843 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5844 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5845 }
5846
5847 C.store( i, j , xmm1 * factor );
5848 C.store( i, j+SIMDSIZE , xmm2 * factor );
5849 C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5850 C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5851 }
5852 else
5853 {
5854 const SIMDType zero;
5855 C.store( i, j , zero );
5856 C.store( i, j+SIMDSIZE , zero );
5857 C.store( i, j+SIMDSIZE*2UL, zero );
5858 C.store( i, j+SIMDSIZE*3UL, zero );
5859 }
5860
     // Step past the row just written so that the UPP clean-up below starts after it.
5861 if( UPP ) ++i;
5862 }
5863
     // UPP: all remaining rows of this column panel are reset.
5864 if( UPP ) {
5865 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
5866 for( ; i<M; ++i ) {
5867 for( size_t jj=j; jj<jjend; ++jj ) {
5868 reset( C(i,jj) );
5869 }
5870 }
5871 }
5872 }
5873
     // Panels of 3 SIMD vectors, two rows at a time plus one leftover row.
5874 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5875 {
5876 const size_t iend( UPP ? min(j+SIMDSIZE*3UL,M) : M );
5877 size_t i( 0UL );
5878
5879 if( SYM || HERM ) {
5880 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
5881 for( ; i<j; ++i ) {
5882 for( size_t jj=j; jj<jjend; ++jj ) {
5883 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
5884 }
5885 }
5886 }
5887 else if( LOW ) {
5888 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
5889 for( ; i<j; ++i ) {
5890 for( size_t jj=j; jj<jjend; ++jj ) {
5891 reset( C(i,jj) );
5892 }
5893 }
5894 }
5895
5896 for( ; (i+2UL) <= iend; i+=2UL )
5897 {
5898 const size_t kbegin( ( IsUpper_v<MT4> )
5899 ?( ( IsLower_v<MT5> )
5900 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5901 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5902 :( IsLower_v<MT5> ? j : 0UL ) );
5903 const size_t kend( ( IsLower_v<MT4> )
5904 ?( ( IsUpper_v<MT5> )
5905 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5906 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5907 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
5908
5909 size_t k( kbegin );
5910
5911 if( k < kend )
5912 {
5913 SIMDType a1( set( A(i ,k) ) );
5914 SIMDType a2( set( A(i+1UL,k) ) );
5915 SIMDType b1( B.load(k,j ) );
5916 SIMDType b2( B.load(k,j+SIMDSIZE ) );
5917 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5918 SIMDType xmm1( a1 * b1 );
5919 SIMDType xmm2( a1 * b2 );
5920 SIMDType xmm3( a1 * b3 );
5921 SIMDType xmm4( a2 * b1 );
5922 SIMDType xmm5( a2 * b2 );
5923 SIMDType xmm6( a2 * b3 );
5924
5925 for( ++k; k<kend; ++k ) {
5926 a1 = set( A(i ,k) );
5927 a2 = set( A(i+1UL,k) );
5928 b1 = B.load(k,j );
5929 b2 = B.load(k,j+SIMDSIZE );
5930 b3 = B.load(k,j+SIMDSIZE*2UL);
5931 xmm1 += a1 * b1;
5932 xmm2 += a1 * b2;
5933 xmm3 += a1 * b3;
5934 xmm4 += a2 * b1;
5935 xmm5 += a2 * b2;
5936 xmm6 += a2 * b3;
5937 }
5938
5939 C.store( i , j , xmm1 * factor );
5940 C.store( i , j+SIMDSIZE , xmm2 * factor );
5941 C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5942 C.store( i+1UL, j , xmm4 * factor );
5943 C.store( i+1UL, j+SIMDSIZE , xmm5 * factor );
5944 C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
5945 }
5946 else
5947 {
5948 const SIMDType zero;
5949 C.store( i , j , zero );
5950 C.store( i , j+SIMDSIZE , zero );
5951 C.store( i , j+SIMDSIZE*2UL, zero );
5952 C.store( i+1UL, j , zero );
5953 C.store( i+1UL, j+SIMDSIZE , zero );
5954 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
5955 }
5956 }
5957
5958 if( i < iend )
5959 {
5960 const size_t kbegin( ( IsUpper_v<MT4> )
5961 ?( ( IsLower_v<MT5> )
5962 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5963 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5964 :( IsLower_v<MT5> ? j : 0UL ) );
5965 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5966
5967 size_t k( kbegin );
5968
5969 if( k < kend )
5970 {
5971 SIMDType a1( set( A(i,k) ) );
5972 SIMDType xmm1( a1 * B.load(k,j ) );
5973 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
5974 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
5975
5976 for( ++k; k<kend; ++k ) {
5977 a1 = set( A(i,k) );
5978 xmm1 += a1 * B.load(k,j );
5979 xmm2 += a1 * B.load(k,j+SIMDSIZE );
5980 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5981 }
5982
5983 C.store( i, j , xmm1 * factor );
5984 C.store( i, j+SIMDSIZE , xmm2 * factor );
5985 C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5986 }
5987 else
5988 {
5989 const SIMDType zero;
5990 C.store( i, j , zero );
5991 C.store( i, j+SIMDSIZE , zero );
5992 C.store( i, j+SIMDSIZE*2UL, zero );
5993 }
5994
5995 if( UPP ) ++i;
5996 }
5997
5998 if( UPP ) {
5999 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
6000 for( ; i<M; ++i ) {
6001 for( size_t jj=j; jj<jjend; ++jj ) {
6002 reset( C(i,jj) );
6003 }
6004 }
6005 }
6006 }
6007
     // Panels of 2 SIMD vectors; rows are consumed in groups of 4, then 3, then 2, then 1.
6008 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6009 {
6010 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
6011 size_t i( 0UL );
6012
6013 if( SYM || HERM ) {
6014 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
6015 for( ; i<j; ++i ) {
6016 for( size_t jj=j; jj<jjend; ++jj ) {
6017 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
6018 }
6019 }
6020 }
6021 else if( LOW ) {
6022 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
6023 for( ; i<j; ++i ) {
6024 for( size_t jj=j; jj<jjend; ++jj ) {
6025 reset( C(i,jj) );
6026 }
6027 }
6028 }
6029
6030 for( ; (i+4UL) <= iend; i+=4UL )
6031 {
6032 const size_t kbegin( ( IsUpper_v<MT4> )
6033 ?( ( IsLower_v<MT5> )
6034 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6035 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6036 :( IsLower_v<MT5> ? j : 0UL ) );
6037 const size_t kend( ( IsLower_v<MT4> )
6038 ?( ( IsUpper_v<MT5> )
6039 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
6040 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
6041 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
6042
6043 size_t k( kbegin );
6044
6045 if( k < kend )
6046 {
6047 SIMDType a1( set( A(i ,k) ) );
6048 SIMDType a2( set( A(i+1UL,k) ) );
6049 SIMDType a3( set( A(i+2UL,k) ) );
6050 SIMDType a4( set( A(i+3UL,k) ) );
6051 SIMDType b1( B.load(k,j ) );
6052 SIMDType b2( B.load(k,j+SIMDSIZE) );
6053 SIMDType xmm1( a1 * b1 );
6054 SIMDType xmm2( a1 * b2 );
6055 SIMDType xmm3( a2 * b1 );
6056 SIMDType xmm4( a2 * b2 );
6057 SIMDType xmm5( a3 * b1 );
6058 SIMDType xmm6( a3 * b2 );
6059 SIMDType xmm7( a4 * b1 );
6060 SIMDType xmm8( a4 * b2 );
6061
6062 for( ++k; k<kend; ++k ) {
6063 a1 = set( A(i ,k) );
6064 a2 = set( A(i+1UL,k) );
6065 a3 = set( A(i+2UL,k) );
6066 a4 = set( A(i+3UL,k) );
6067 b1 = B.load(k,j );
6068 b2 = B.load(k,j+SIMDSIZE);
6069 xmm1 += a1 * b1;
6070 xmm2 += a1 * b2;
6071 xmm3 += a2 * b1;
6072 xmm4 += a2 * b2;
6073 xmm5 += a3 * b1;
6074 xmm6 += a3 * b2;
6075 xmm7 += a4 * b1;
6076 xmm8 += a4 * b2;
6077 }
6078
6079 C.store( i , j , xmm1 * factor );
6080 C.store( i , j+SIMDSIZE, xmm2 * factor );
6081 C.store( i+1UL, j , xmm3 * factor );
6082 C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
6083 C.store( i+2UL, j , xmm5 * factor );
6084 C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
6085 C.store( i+3UL, j , xmm7 * factor );
6086 C.store( i+3UL, j+SIMDSIZE, xmm8 * factor );
6087 }
6088 else
6089 {
6090 const SIMDType zero;
6091 C.store( i , j , zero );
6092 C.store( i , j+SIMDSIZE, zero );
6093 C.store( i+1UL, j , zero );
6094 C.store( i+1UL, j+SIMDSIZE, zero );
6095 C.store( i+2UL, j , zero );
6096 C.store( i+2UL, j+SIMDSIZE, zero );
6097 C.store( i+3UL, j , zero );
6098 C.store( i+3UL, j+SIMDSIZE, zero );
6099 }
6100 }
6101
6102 for( ; (i+3UL) <= iend; i+=3UL )
6103 {
6104 const size_t kbegin( ( IsUpper_v<MT4> )
6105 ?( ( IsLower_v<MT5> )
6106 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6107 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6108 :( IsLower_v<MT5> ? j : 0UL ) );
6109 const size_t kend( ( IsLower_v<MT4> )
6110 ?( ( IsUpper_v<MT5> )
6111 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
6112 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
6113 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
6114
6115 size_t k( kbegin );
6116
6117 if( k < kend )
6118 {
6119 SIMDType a1( set( A(i ,k) ) );
6120 SIMDType a2( set( A(i+1UL,k) ) );
6121 SIMDType a3( set( A(i+2UL,k) ) );
6122 SIMDType b1( B.load(k,j ) );
6123 SIMDType b2( B.load(k,j+SIMDSIZE) );
6124 SIMDType xmm1( a1 * b1 );
6125 SIMDType xmm2( a1 * b2 );
6126 SIMDType xmm3( a2 * b1 );
6127 SIMDType xmm4( a2 * b2 );
6128 SIMDType xmm5( a3 * b1 );
6129 SIMDType xmm6( a3 * b2 );
6130
6131 for( ++k; k<kend; ++k ) {
6132 a1 = set( A(i ,k) );
6133 a2 = set( A(i+1UL,k) );
6134 a3 = set( A(i+2UL,k) );
6135 b1 = B.load(k,j );
6136 b2 = B.load(k,j+SIMDSIZE);
6137 xmm1 += a1 * b1;
6138 xmm2 += a1 * b2;
6139 xmm3 += a2 * b1;
6140 xmm4 += a2 * b2;
6141 xmm5 += a3 * b1;
6142 xmm6 += a3 * b2;
6143 }
6144
6145 C.store( i , j , xmm1 * factor );
6146 C.store( i , j+SIMDSIZE, xmm2 * factor );
6147 C.store( i+1UL, j , xmm3 * factor );
6148 C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
6149 C.store( i+2UL, j , xmm5 * factor );
6150 C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
6151 }
6152 else
6153 {
6154 const SIMDType zero;
6155 C.store( i , j , zero );
6156 C.store( i , j+SIMDSIZE, zero );
6157 C.store( i+1UL, j , zero );
6158 C.store( i+1UL, j+SIMDSIZE, zero );
6159 C.store( i+2UL, j , zero );
6160 C.store( i+2UL, j+SIMDSIZE, zero );
6161 }
6162 }
6163
6164 for( ; (i+2UL) <= iend; i+=2UL )
6165 {
6166 const size_t kbegin( ( IsUpper_v<MT4> )
6167 ?( ( IsLower_v<MT5> )
6168 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6169 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6170 :( IsLower_v<MT5> ? j : 0UL ) );
6171 const size_t kend( ( IsLower_v<MT4> )
6172 ?( ( IsUpper_v<MT5> )
6173 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6174 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6175 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
6176
6177 size_t k( kbegin );
6178
6179 if( k < kend )
6180 {
6181 SIMDType a1( set( A(i ,k) ) );
6182 SIMDType a2( set( A(i+1UL,k) ) );
6183 SIMDType b1( B.load(k,j ) );
6184 SIMDType b2( B.load(k,j+SIMDSIZE) );
6185 SIMDType xmm1( a1 * b1 );
6186 SIMDType xmm2( a1 * b2 );
6187 SIMDType xmm3( a2 * b1 );
6188 SIMDType xmm4( a2 * b2 );
6189
6190 for( ++k; k<kend; ++k ) {
6191 a1 = set( A(i ,k) );
6192 a2 = set( A(i+1UL,k) );
6193 b1 = B.load(k,j );
6194 b2 = B.load(k,j+SIMDSIZE);
6195 xmm1 += a1 * b1;
6196 xmm2 += a1 * b2;
6197 xmm3 += a2 * b1;
6198 xmm4 += a2 * b2;
6199 }
6200
6201 C.store( i , j , xmm1 * factor );
6202 C.store( i , j+SIMDSIZE, xmm2 * factor );
6203 C.store( i+1UL, j , xmm3 * factor );
6204 C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
6205 }
6206 else
6207 {
6208 const SIMDType zero;
6209 C.store( i , j , zero );
6210 C.store( i , j+SIMDSIZE, zero );
6211 C.store( i+1UL, j , zero );
6212 C.store( i+1UL, j+SIMDSIZE, zero );
6213 }
6214 }
6215
6216 if( i < iend )
6217 {
6218 const size_t kbegin( ( IsUpper_v<MT4> )
6219 ?( ( IsLower_v<MT5> )
6220 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6221 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6222 :( IsLower_v<MT5> ? j : 0UL ) );
6223 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
6224
6225 size_t k( kbegin );
6226
6227 if( k < kend )
6228 {
6229 SIMDType a1( set( A(i,k) ) );
6230 SIMDType xmm1( a1 * B.load(k,j ) );
6231 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE) );
6232
6233 for( ++k; k<kend; ++k ) {
6234 a1 = set( A(i,k) );
6235 xmm1 += a1 * B.load(k,j );
6236 xmm2 += a1 * B.load(k,j+SIMDSIZE);
6237 }
6238
6239 C.store( i, j , xmm1 * factor );
6240 C.store( i, j+SIMDSIZE, xmm2 * factor );
6241 }
6242 else
6243 {
6244 const SIMDType zero;
6245 C.store( i, j , zero );
6246 C.store( i, j+SIMDSIZE, zero );
6247 }
6248
6249 if( UPP ) ++i;
6250 }
6251
6252 if( UPP ) {
6253 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
6254 for( ; i<M; ++i ) {
6255 for( size_t jj=j; jj<jjend; ++jj ) {
6256 reset( C(i,jj) );
6257 }
6258 }
6259 }
6260 }
6261
     // Panels of a single SIMD vector; rows in groups of 4, 3, 2, 1. In this panel the upper
     // k-bound of the multi-row groups depends on MT4 only, and the single row runs up to K.
6262 for( ; j<jpos; j+=SIMDSIZE )
6263 {
6264 const size_t iend( UPP ? min(j+SIMDSIZE,M) : M );
6265 size_t i( 0UL );
6266
6267 if( SYM || HERM ) {
6268 const size_t jjend( min(j+SIMDSIZE,N) );
6269 for( ; i<j; ++i ) {
6270 for( size_t jj=j; jj<jjend; ++jj ) {
6271 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
6272 }
6273 }
6274 }
6275 else if( LOW ) {
6276 const size_t jjend( min(j+SIMDSIZE,N) );
6277 for( ; i<j; ++i ) {
6278 for( size_t jj=j; jj<jjend; ++jj ) {
6279 reset( C(i,jj) );
6280 }
6281 }
6282 }
6283
6284 for( ; (i+4UL) <= iend; i+=4UL )
6285 {
6286 const size_t kbegin( ( IsUpper_v<MT4> )
6287 ?( ( IsLower_v<MT5> )
6288 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6289 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6290 :( IsLower_v<MT5> ? j : 0UL ) );
6291 const size_t kend( ( IsLower_v<MT4> )
6292 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
6293 :( K ) );
6294
6295 size_t k( kbegin );
6296
6297 if( k < kend )
6298 {
6299 SIMDType b1( B.load(k,j) );
6300 SIMDType xmm1( set( A(i ,k) ) * b1 );
6301 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
6302 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
6303 SIMDType xmm4( set( A(i+3UL,k) ) * b1 );
6304
6305 for( ++k; k<kend; ++k ) {
6306 b1 = B.load(k,j);
6307 xmm1 += set( A(i ,k) ) * b1;
6308 xmm2 += set( A(i+1UL,k) ) * b1;
6309 xmm3 += set( A(i+2UL,k) ) * b1;
6310 xmm4 += set( A(i+3UL,k) ) * b1;
6311 }
6312
6313 C.store( i , j, xmm1 * factor );
6314 C.store( i+1UL, j, xmm2 * factor );
6315 C.store( i+2UL, j, xmm3 * factor );
6316 C.store( i+3UL, j, xmm4 * factor );
6317 }
6318 else
6319 {
6320 const SIMDType zero;
6321 C.store( i , j, zero );
6322 C.store( i+1UL, j, zero );
6323 C.store( i+2UL, j, zero );
6324 C.store( i+3UL, j, zero );
6325 }
6326 }
6327
6328 for( ; (i+3UL) <= iend; i+=3UL )
6329 {
6330 const size_t kbegin( ( IsUpper_v<MT4> )
6331 ?( ( IsLower_v<MT5> )
6332 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6333 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6334 :( IsLower_v<MT5> ? j : 0UL ) );
6335 const size_t kend( ( IsLower_v<MT4> )
6336 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
6337 :( K ) );
6338
6339 size_t k( kbegin );
6340
6341 if( k < kend )
6342 {
6343 SIMDType b1( B.load(k,j) );
6344 SIMDType xmm1( set( A(i ,k) ) * b1 );
6345 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
6346 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
6347
6348 for( ++k; k<kend; ++k ) {
6349 b1 = B.load(k,j);
6350 xmm1 += set( A(i ,k) ) * b1;
6351 xmm2 += set( A(i+1UL,k) ) * b1;
6352 xmm3 += set( A(i+2UL,k) ) * b1;
6353 }
6354
6355 C.store( i , j, xmm1 * factor );
6356 C.store( i+1UL, j, xmm2 * factor );
6357 C.store( i+2UL, j, xmm3 * factor );
6358 }
6359 else
6360 {
6361 const SIMDType zero;
6362 C.store( i , j, zero );
6363 C.store( i+1UL, j, zero );
6364 C.store( i+2UL, j, zero );
6365 }
6366 }
6367
6368 for( ; (i+2UL) <= iend; i+=2UL )
6369 {
6370 const size_t kbegin( ( IsUpper_v<MT4> )
6371 ?( ( IsLower_v<MT5> )
6372 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6373 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6374 :( IsLower_v<MT5> ? j : 0UL ) );
6375 const size_t kend( ( IsLower_v<MT4> )
6376 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6377 :( K ) );
6378
6379 size_t k( kbegin );
6380
6381 if( k < kend )
6382 {
6383 SIMDType b1( B.load(k,j) );
6384 SIMDType xmm1( set( A(i ,k) ) * b1 );
6385 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
6386
6387 for( ++k; k<kend; ++k ) {
6388 b1 = B.load(k,j);
6389 xmm1 += set( A(i ,k) ) * b1;
6390 xmm2 += set( A(i+1UL,k) ) * b1;
6391 }
6392
6393 C.store( i , j, xmm1 * factor );
6394 C.store( i+1UL, j, xmm2 * factor );
6395 }
6396 else
6397 {
6398 const SIMDType zero;
6399 C.store( i , j, zero );
6400 C.store( i+1UL, j, zero );
6401 }
6402 }
6403
6404 if( i < iend )
6405 {
6406 const size_t kbegin( ( IsUpper_v<MT4> )
6407 ?( ( IsLower_v<MT5> )
6408 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6409 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6410 :( IsLower_v<MT5> ? j : 0UL ) );
6411
6412 size_t k( kbegin );
6413
6414 if( k < K )
6415 {
6416 SIMDType xmm1( set( A(i,k) ) * B.load(k,j) );
6417
6418 for( ++k; k<K; ++k ) {
6419 xmm1 += set( A(i,k) ) * B.load(k,j);
6420 }
6421
6422 C.store( i, j, xmm1 * factor );
6423 }
6424 else
6425 {
6426 const SIMDType zero;
6427 C.store( i, j, zero );
6428 }
6429
6430 if( UPP ) ++i;
6431 }
6432
6433 if( UPP ) {
6434 const size_t jjend( min(j+SIMDSIZE,N) );
6435 for( ; i<M; ++i ) {
6436 for( size_t jj=j; jj<jjend; ++jj ) {
6437 reset( C(i,jj) );
6438 }
6439 }
6440 }
6441 }
6442
     // Scalar tail: the columns from jpos up to N are computed element by element, two rows at
     // a time plus one leftover row. SYM/HERM/LOW are handled as above; there is no UPP-specific
     // row limit in this loop.
6443 for( ; remainder && j<N; ++j )
6444 {
6445 size_t i( 0UL );
6446
6447 if( SYM || HERM ) {
6448 for( ; i<j; ++i ) {
6449 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
6450 }
6451 }
6452 else if( LOW ) {
6453 for( ; i<j; ++i ) {
6454 reset( C(i,j) );
6455 }
6456 }
6457
6458 for( ; (i+2UL) <= M; i+=2UL )
6459 {
6460 const size_t kbegin( ( IsUpper_v<MT4> )
6461 ?( ( IsLower_v<MT5> )
6462 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6463 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6464 :( IsLower_v<MT5> ? j : 0UL ) );
6465 const size_t kend( ( IsLower_v<MT4> )
6466 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6467 :( K ) );
6468
6469 size_t k( kbegin );
6470
6471 if( k < kend )
6472 {
6473 ElementType value1( A(i ,k) * B(k,j) );
6474 ElementType value2( A(i+1UL,k) * B(k,j) );
6475
6476 for( ++k; k<kend; ++k ) {
6477 value1 += A(i ,k) * B(k,j);
6478 value2 += A(i+1UL,k) * B(k,j);
6479 }
6480
6481 C(i ,j) = value1 * scalar;
6482 C(i+1UL,j) = value2 * scalar;
6483 }
6484 else
6485 {
6486 reset( C(i ,j) );
6487 reset( C(i+1UL,j) );
6488 }
6489 }
6490
6491 if( i < M )
6492 {
6493 const size_t kbegin( ( IsUpper_v<MT4> )
6494 ?( ( IsLower_v<MT5> )
6495 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6496 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6497 :( IsLower_v<MT5> ? j : 0UL ) );
6498
6499 size_t k( kbegin );
6500
6501 if( k < K )
6502 {
6503 ElementType value( A(i,k) * B(k,j) );
6504
6505 for( ++k; k<K; ++k ) {
6506 value += A(i,k) * B(k,j);
6507 }
6508
6509 C(i,j) = value * scalar;
6510 }
6511 else
6512 {
6513 reset( C(i,j) );
6514 }
6515 }
6516 }
6517 }
6518 //**********************************************************************************************
6519
6520 //**Vectorized default assignment to column-major dense matrices (small matrices)***************
6535 template< typename MT3 // Type of the left-hand side target matrix
6536 , typename MT4 // Type of the left-hand side matrix operand
6537 , typename MT5 // Type of the right-hand side matrix operand
6538 , typename ST2 > // Type of the scalar value
6539 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6540 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6541 {
6546
6547 const ForwardFunctor fwd;
6548
6549 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6550 const OppositeType_t<MT4> tmp( serial( A ) );
6551 assign( C, fwd( tmp * B ) * scalar );
6552 }
6553 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6554 const OppositeType_t<MT5> tmp( serial( B ) );
6555 assign( C, fwd( A * tmp ) * scalar );
6556 }
6557 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6558 const OppositeType_t<MT4> tmp( serial( A ) );
6559 assign( C, fwd( tmp * B ) * scalar );
6560 }
6561 else {
6562 const OppositeType_t<MT5> tmp( serial( B ) );
6563 assign( C, fwd( A * tmp ) * scalar );
6564 }
6565 }
6566 //**********************************************************************************************
6567
6568 //**Default assignment to dense matrices (large matrices)***************************************
6582 template< typename MT3 // Type of the left-hand side target matrix
6583 , typename MT4 // Type of the left-hand side matrix operand
6584 , typename MT5 // Type of the right-hand side matrix operand
6585 , typename ST2 > // Type of the scalar value
6586 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6587 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6588 {
6589 selectDefaultAssignKernel( C, A, B, scalar );
6590 }
6591 //**********************************************************************************************
6592
6593 //**Vectorized default assignment to dense matrices (large matrices)****************************
6608 template< typename MT3 // Type of the left-hand side target matrix
6609 , typename MT4 // Type of the left-hand side matrix operand
6610 , typename MT5 // Type of the right-hand side matrix operand
6611 , typename ST2 > // Type of the scalar value
6612 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6613 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6614 {
6615 if( SYM )
6616 smmm( C, A, B, scalar );
6617 else if( HERM )
6618 hmmm( C, A, B, scalar );
6619 else if( LOW )
6620 lmmm( C, A, B, scalar, ST2(0) );
6621 else if( UPP )
6622 ummm( C, A, B, scalar, ST2(0) );
6623 else
6624 mmm( C, A, B, scalar, ST2(0) );
6625 }
6626 //**********************************************************************************************
6627
6628 //**BLAS-based assignment to dense matrices (default)*******************************************
6642 template< typename MT3 // Type of the left-hand side target matrix
6643 , typename MT4 // Type of the left-hand side matrix operand
6644 , typename MT5 // Type of the right-hand side matrix operand
6645 , typename ST2 > // Type of the scalar value
6646 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6647 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6648 {
6649 selectLargeAssignKernel( C, A, B, scalar );
6650 }
6651 //**********************************************************************************************
6652
6653 //**BLAS-based assignment to dense matrices*****************************************************
6654#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6668 template< typename MT3 // Type of the left-hand side target matrix
6669 , typename MT4 // Type of the left-hand side matrix operand
6670 , typename MT5 // Type of the right-hand side matrix operand
6671 , typename ST2 > // Type of the scalar value
6672 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6673 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6674 {
6675 using ET = ElementType_t<MT3>;
6676
6677 if( IsTriangular_v<MT4> ) {
6678 assign( C, B );
6679 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6680 }
6681 else if( IsTriangular_v<MT5> ) {
6682 assign( C, A );
6683 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6684 }
6685 else {
6686 gemm( C, A, B, ET(scalar), ET(0) );
6687 }
6688 }
6689#endif
6690 //**********************************************************************************************
6691
6692 //**Assignment to sparse matrices***************************************************************
6704 template< typename MT // Type of the target sparse matrix
6705 , bool SO > // Storage order of the target sparse matrix
6706 friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6707 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6708 {
6710
6711 using TmpType = If_t< SO, OppositeType, ResultType >;
6712
6719
6720 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6721 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6722
6723 const ForwardFunctor fwd;
6724
6725 const TmpType tmp( serial( rhs ) );
6726 assign( *lhs, fwd( tmp ) );
6727 }
6728 //**********************************************************************************************
6729
6730 //**Restructuring assignment to column-major matrices*******************************************
6744 template< typename MT > // Type of the target matrix
6745 friend inline auto assign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
6746 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6747 {
6749
6751
6752 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6753 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6754
6755 const ForwardFunctor fwd;
6756
6757 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
6758 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
6759
6760 assign( *lhs, fwd( A * B ) * rhs.scalar_ );
6761 }
6762 //**********************************************************************************************
6763
6764 //**Addition assignment to dense matrices*******************************************************
6776 template< typename MT // Type of the target dense matrix
6777 , bool SO > // Storage order of the target dense matrix
6778 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6779 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6780 {
6782
6783 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6784 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6785
6786 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6787 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6788
6789 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
6790 return;
6791 }
6792
6793 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6794 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6795
6796 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6797 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6798 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6799 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6800 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
6801 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
6802
6803 DMatScalarMultExpr::selectAddAssignKernel( *lhs, A, B, rhs.scalar_ );
6804 }
6805 //**********************************************************************************************
6806
6807 //**Addition assignment to dense matrices (kernel selection)************************************
6818 template< typename MT3 // Type of the left-hand side target matrix
6819 , typename MT4 // Type of the left-hand side matrix operand
6820 , typename MT5 // Type of the right-hand side matrix operand
6821 , typename ST2 > // Type of the scalar value
6822 static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6823 {
6824 if( ( IsDiagonal_v<MT5> ) ||
6825 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
6826 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6827 selectSmallAddAssignKernel( C, A, B, scalar );
6828 else
6829 selectBlasAddAssignKernel( C, A, B, scalar );
6830 }
6831 //**********************************************************************************************
6832
6833 //**Default addition assignment to dense matrices (general/general)*****************************
6847 template< typename MT3 // Type of the left-hand side target matrix
6848 , typename MT4 // Type of the left-hand side matrix operand
6849 , typename MT5 // Type of the right-hand side matrix operand
6850 , typename ST2 > // Type of the scalar value
6851 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6852 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6853 {
6854 const ResultType tmp( serial( A * B * scalar ) );
6855 addAssign( C, tmp );
6856 }
6857 //**********************************************************************************************
6858
6859 //**Default addition assignment to dense matrices (general/diagonal)****************************
6873 template< typename MT3 // Type of the left-hand side target matrix
6874 , typename MT4 // Type of the left-hand side matrix operand
6875 , typename MT5 // Type of the right-hand side matrix operand
6876 , typename ST2 > // Type of the scalar value
6877 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6878 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6879 {
6881
6882 const size_t M( A.rows() );
6883 const size_t N( B.columns() );
6884
6885 for( size_t i=0UL; i<M; ++i )
6886 {
6887 const size_t jbegin( ( IsUpper_v<MT4> )
6888 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6889 :( 0UL ) );
6890 const size_t jend( ( IsLower_v<MT4> )
6891 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
6892 :( N ) );
6893 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6894
6895 const size_t jnum( jend - jbegin );
6896 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
6897 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
6898
6899 for( size_t j=jbegin; j<jpos; j+=2UL ) {
6900 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6901 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6902 }
6903 if( jpos < jend ) {
6904 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
6905 }
6906 }
6907 }
6908 //**********************************************************************************************
6909
6910 //**Default addition assignment to dense matrices (diagonal/general)****************************
6924 template< typename MT3 // Type of the left-hand side target matrix
6925 , typename MT4 // Type of the left-hand side matrix operand
6926 , typename MT5 // Type of the right-hand side matrix operand
6927 , typename ST2 > // Type of the scalar value
6928 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6929 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6930 {
6932
6933 const size_t M( A.rows() );
6934 const size_t N( B.columns() );
6935
6936 for( size_t i=0UL; i<M; ++i )
6937 {
6938 const size_t jbegin( ( IsUpper_v<MT5> )
6939 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
6940 :( 0UL ) );
6941 const size_t jend( ( IsLower_v<MT5> )
6942 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
6943 :( N ) );
6944 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6945
6946 const size_t jnum( jend - jbegin );
6947 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
6948 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
6949
6950 for( size_t j=jbegin; j<jpos; j+=2UL ) {
6951 C(i,j ) += A(i,i) * B(i,j ) * scalar;
6952 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
6953 }
6954 if( jpos < jend ) {
6955 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
6956 }
6957 }
6958 }
6959 //**********************************************************************************************
6960
6961 //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
6975 template< typename MT3 // Type of the left-hand side target matrix
6976 , typename MT4 // Type of the left-hand side matrix operand
6977 , typename MT5 // Type of the right-hand side matrix operand
6978 , typename ST2 > // Type of the scalar value
6979 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6980 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6981 {
6983
6984 for( size_t i=0UL; i<A.rows(); ++i ) {
6985 C(i,i) += A(i,i) * B(i,i) * scalar;
6986 }
6987 }
6988 //**********************************************************************************************
6989
6990 //**Default addition assignment to dense matrices (small matrices)******************************
7004 template< typename MT3 // Type of the left-hand side target matrix
7005 , typename MT4 // Type of the left-hand side matrix operand
7006 , typename MT5 // Type of the right-hand side matrix operand
7007 , typename ST2 > // Type of the scalar value
7008 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7009 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7010 {
7011 selectDefaultAddAssignKernel( C, A, B, scalar );
7012 }
7013 //**********************************************************************************************
7014
7015 //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
7030 template< typename MT3 // Type of the left-hand side target matrix
7031 , typename MT4 // Type of the left-hand side matrix operand
7032 , typename MT5 // Type of the right-hand side matrix operand
7033 , typename ST2 > // Type of the scalar value
7034 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7035 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7036 {
7037 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
7038
7039 const size_t M( A.rows() );
7040 const size_t N( B.columns() );
7041 const size_t K( A.columns() );
7042
7043 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7044
7045 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
7046 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
7047
7048 const SIMDType factor( set( scalar ) );
7049
7050 size_t j( 0UL );
7051
7052 if( IsIntegral_v<ElementType> )
7053 {
7054 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7055 for( size_t i=0UL; i<M; ++i )
7056 {
7057 const size_t kbegin( ( IsUpper_v<MT4> )
7058 ?( ( IsLower_v<MT5> )
7059 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7060 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7061 :( IsLower_v<MT5> ? j : 0UL ) );
7062 const size_t kend( ( IsLower_v<MT4> )
7063 ?( ( IsUpper_v<MT5> )
7064 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
7065 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7066 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
7067
7068 size_t k( kbegin );
7069
7070 if( k < kend )
7071 {
7072 SIMDType a1( set( A(i,k) ) );
7073 SIMDType xmm1( a1 * B.load(k,j ) );
7074 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
7075 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
7076 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
7077 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
7078 SIMDType xmm6( a1 * B.load(k,j+SIMDSIZE*5UL) );
7079 SIMDType xmm7( a1 * B.load(k,j+SIMDSIZE*6UL) );
7080 SIMDType xmm8( a1 * B.load(k,j+SIMDSIZE*7UL) );
7081
7082 for( ++k; k<kend; ++k ) {
7083 a1 = set( A(i,k) );
7084 xmm1 += a1 * B.load(k,j );
7085 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7086 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7087 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7088 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7089 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7090 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7091 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7092 }
7093
7094 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7095 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
7096 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
7097 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
7098 C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
7099 C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
7100 C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
7101 C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
7102 }
7103 }
7104 }
7105 }
7106
7107 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7108 {
7109 size_t i( 0UL );
7110
7111 for( ; (i+2UL) <= M; i+=2UL )
7112 {
7113 const size_t kbegin( ( IsUpper_v<MT4> )
7114 ?( ( IsLower_v<MT5> )
7115 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7116 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7117 :( IsLower_v<MT5> ? j : 0UL ) );
7118 const size_t kend( ( IsLower_v<MT4> )
7119 ?( ( IsUpper_v<MT5> )
7120 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
7121 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7122 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
7123
7124 size_t k( kbegin );
7125
7126 if( k < kend )
7127 {
7128 SIMDType a1( set( A(i ,k) ) );
7129 SIMDType a2( set( A(i+1UL,k) ) );
7130 SIMDType b1( B.load(k,j ) );
7131 SIMDType b2( B.load(k,j+SIMDSIZE ) );
7132 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7133 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7134 SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7135 SIMDType xmm1 ( a1 * b1 );
7136 SIMDType xmm2 ( a1 * b2 );
7137 SIMDType xmm3 ( a1 * b3 );
7138 SIMDType xmm4 ( a1 * b4 );
7139 SIMDType xmm5 ( a1 * b5 );
7140 SIMDType xmm6 ( a2 * b1 );
7141 SIMDType xmm7 ( a2 * b2 );
7142 SIMDType xmm8 ( a2 * b3 );
7143 SIMDType xmm9 ( a2 * b4 );
7144 SIMDType xmm10( a2 * b5 );
7145
7146 for( ++k; k<kend; ++k ) {
7147 a1 = set( A(i ,k) );
7148 a2 = set( A(i+1UL,k) );
7149 b1 = B.load(k,j );
7150 b2 = B.load(k,j+SIMDSIZE );
7151 b3 = B.load(k,j+SIMDSIZE*2UL);
7152 b4 = B.load(k,j+SIMDSIZE*3UL);
7153 b5 = B.load(k,j+SIMDSIZE*4UL);
7154 xmm1 += a1 * b1;
7155 xmm2 += a1 * b2;
7156 xmm3 += a1 * b3;
7157 xmm4 += a1 * b4;
7158 xmm5 += a1 * b5;
7159 xmm6 += a2 * b1;
7160 xmm7 += a2 * b2;
7161 xmm8 += a2 * b3;
7162 xmm9 += a2 * b4;
7163 xmm10 += a2 * b5;
7164 }
7165
7166 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7167 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
7168 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
7169 C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
7170 C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
7171 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
7172 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
7173 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
7174 C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
7175 C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
7176 }
7177 }
7178
7179 if( i < M )
7180 {
7181 const size_t kbegin( ( IsUpper_v<MT4> )
7182 ?( ( IsLower_v<MT5> )
7183 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7184 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7185 :( IsLower_v<MT5> ? j : 0UL ) );
7186 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
7187
7188 size_t k( kbegin );
7189
7190 if( k < kend )
7191 {
7192 SIMDType a1( set( A(i,k) ) );
7193 SIMDType xmm1( a1 * B.load(k,j ) );
7194 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
7195 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
7196 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
7197 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
7198
7199 for( ++k; k<kend; ++k ) {
7200 a1 = set( A(i,k) );
7201 xmm1 += a1 * B.load(k,j );
7202 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7203 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7204 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7205 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7206 }
7207
7208 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7209 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
7210 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
7211 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
7212 C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
7213 }
7214 }
7215 }
7216
7217 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7218 {
7219 size_t i( 0UL );
7220
7221 for( ; (i+2UL) <= M; i+=2UL )
7222 {
7223 const size_t kbegin( ( IsUpper_v<MT4> )
7224 ?( ( IsLower_v<MT5> )
7225 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7226 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7227 :( IsLower_v<MT5> ? j : 0UL ) );
7228 const size_t kend( ( IsLower_v<MT4> )
7229 ?( ( IsUpper_v<MT5> )
7230 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
7231 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7232 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
7233
7234 size_t k( kbegin );
7235
7236 if( k < kend )
7237 {
7238 SIMDType a1( set( A(i ,k) ) );
7239 SIMDType a2( set( A(i+1UL,k) ) );
7240 SIMDType b1( B.load(k,j ) );
7241 SIMDType b2( B.load(k,j+SIMDSIZE ) );
7242 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7243 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7244 SIMDType xmm1( a1 * b1 );
7245 SIMDType xmm2( a1 * b2 );
7246 SIMDType xmm3( a1 * b3 );
7247 SIMDType xmm4( a1 * b4 );
7248 SIMDType xmm5( a2 * b1 );
7249 SIMDType xmm6( a2 * b2 );
7250 SIMDType xmm7( a2 * b3 );
7251 SIMDType xmm8( a2 * b4 );
7252
7253 for( ++k; k<kend; ++k ) {
7254 a1 = set( A(i ,k) );
7255 a2 = set( A(i+1UL,k) );
7256 b1 = B.load(k,j );
7257 b2 = B.load(k,j+SIMDSIZE );
7258 b3 = B.load(k,j+SIMDSIZE*2UL);
7259 b4 = B.load(k,j+SIMDSIZE*3UL);
7260 xmm1 += a1 * b1;
7261 xmm2 += a1 * b2;
7262 xmm3 += a1 * b3;
7263 xmm4 += a1 * b4;
7264 xmm5 += a2 * b1;
7265 xmm6 += a2 * b2;
7266 xmm7 += a2 * b3;
7267 xmm8 += a2 * b4;
7268 }
7269
7270 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7271 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
7272 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
7273 C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
7274 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
7275 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
7276 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
7277 C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
7278 }
7279 }
7280
7281 if( i < M )
7282 {
7283 const size_t kbegin( ( IsUpper_v<MT4> )
7284 ?( ( IsLower_v<MT5> )
7285 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7286 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7287 :( IsLower_v<MT5> ? j : 0UL ) );
7288 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
7289
7290 size_t k( kbegin );
7291
7292 if( k < kend )
7293 {
7294 SIMDType a1( set( A(i,k) ) );
7295 SIMDType xmm1( a1 * B.load(k,j ) );
7296 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
7297 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
7298 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
7299
7300 for( ++k; k<kend; ++k ) {
7301 a1 = set( A(i,k) );
7302 xmm1 += a1 * B.load(k,j );
7303 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7304 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7305 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7306 }
7307
7308 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7309 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
7310 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
7311 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
7312 }
7313 }
7314 }
7315
7316 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
7317 {
7318 size_t i( 0UL );
7319
7320 for( ; (i+2UL) <= M; i+=2UL )
7321 {
7322 const size_t kbegin( ( IsUpper_v<MT4> )
7323 ?( ( IsLower_v<MT5> )
7324 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7325 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7326 :( IsLower_v<MT5> ? j : 0UL ) );
7327 const size_t kend( ( IsLower_v<MT4> )
7328 ?( ( IsUpper_v<MT5> )
7329 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
7330 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7331 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
7332
7333 size_t k( kbegin );
7334
7335 if( k < kend )
7336 {
7337 SIMDType a1( set( A(i ,k) ) );
7338 SIMDType a2( set( A(i+1UL,k) ) );
7339 SIMDType b1( B.load(k,j ) );
7340 SIMDType b2( B.load(k,j+SIMDSIZE ) );
7341 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7342 SIMDType xmm1( a1 * b1 );
7343 SIMDType xmm2( a1 * b2 );
7344 SIMDType xmm3( a1 * b3 );
7345 SIMDType xmm4( a2 * b1 );
7346 SIMDType xmm5( a2 * b2 );
7347 SIMDType xmm6( a2 * b3 );
7348
7349 for( ++k; k<kend; ++k ) {
7350 a1 = set( A(i ,k) );
7351 a2 = set( A(i+1UL,k) );
7352 b1 = B.load(k,j );
7353 b2 = B.load(k,j+SIMDSIZE );
7354 b3 = B.load(k,j+SIMDSIZE*2UL);
7355 xmm1 += a1 * b1;
7356 xmm2 += a1 * b2;
7357 xmm3 += a1 * b3;
7358 xmm4 += a2 * b1;
7359 xmm5 += a2 * b2;
7360 xmm6 += a2 * b3;
7361 }
7362
7363 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7364 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
7365 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
7366 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
7367 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
7368 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
7369 }
7370 }
7371
7372 if( i < M )
7373 {
7374 const size_t kbegin( ( IsUpper_v<MT4> )
7375 ?( ( IsLower_v<MT5> )
7376 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7377 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7378 :( IsLower_v<MT5> ? j : 0UL ) );
7379 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
7380
7381 size_t k( kbegin );
7382
7383 if( k < kend )
7384 {
7385 SIMDType a1( set( A(i,k) ) );
7386 SIMDType xmm1( a1 * B.load(k,j ) );
7387 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
7388 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
7389
7390 for( ++k; k<kend; ++k ) {
7391 a1 = set( A(i,k) );
7392 xmm1 += a1 * B.load(k,j );
7393 xmm2 += a1 * B.load(k,j+SIMDSIZE );
7394 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7395 }
7396
7397 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7398 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
7399 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
7400 }
7401 }
7402 }
7403
7404 for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
7405 {
7406 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
7407 size_t i( LOW ? j : 0UL );
7408
7409 for( ; (i+4UL) <= iend; i+=4UL )
7410 {
7411 const size_t kbegin( ( IsUpper_v<MT4> )
7412 ?( ( IsLower_v<MT5> )
7413 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7414 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7415 :( IsLower_v<MT5> ? j : 0UL ) );
7416 const size_t kend( ( IsLower_v<MT4> )
7417 ?( ( IsUpper_v<MT5> )
7418 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
7419 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
7420 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
7421
7422 size_t k( kbegin );
7423
7424 if( k < kend )
7425 {
7426 SIMDType a1( set( A(i ,k) ) );
7427 SIMDType a2( set( A(i+1UL,k) ) );
7428 SIMDType a3( set( A(i+2UL,k) ) );
7429 SIMDType a4( set( A(i+3UL,k) ) );
7430 SIMDType b1( B.load(k,j ) );
7431 SIMDType b2( B.load(k,j+SIMDSIZE) );
7432 SIMDType xmm1( a1 * b1 );
7433 SIMDType xmm2( a1 * b2 );
7434 SIMDType xmm3( a2 * b1 );
7435 SIMDType xmm4( a2 * b2 );
7436 SIMDType xmm5( a3 * b1 );
7437 SIMDType xmm6( a3 * b2 );
7438 SIMDType xmm7( a4 * b1 );
7439 SIMDType xmm8( a4 * b2 );
7440
7441 for( ++k; k<kend; ++k ) {
7442 a1 = set( A(i ,k) );
7443 a2 = set( A(i+1UL,k) );
7444 a3 = set( A(i+2UL,k) );
7445 a4 = set( A(i+3UL,k) );
7446 b1 = B.load(k,j );
7447 b2 = B.load(k,j+SIMDSIZE);
7448 xmm1 += a1 * b1;
7449 xmm2 += a1 * b2;
7450 xmm3 += a2 * b1;
7451 xmm4 += a2 * b2;
7452 xmm5 += a3 * b1;
7453 xmm6 += a3 * b2;
7454 xmm7 += a4 * b1;
7455 xmm8 += a4 * b2;
7456 }
7457
7458 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7459 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
7460 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7461 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
7462 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
7463 C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
7464 C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
7465 C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
7466 }
7467 }
7468
7469 for( ; (i+3UL) <= iend; i+=3UL )
7470 {
7471 const size_t kbegin( ( IsUpper_v<MT4> )
7472 ?( ( IsLower_v<MT5> )
7473 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7474 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7475 :( IsLower_v<MT5> ? j : 0UL ) );
7476 const size_t kend( ( IsLower_v<MT4> )
7477 ?( ( IsUpper_v<MT5> )
7478 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
7479 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
7480 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
7481
7482 size_t k( kbegin );
7483
7484 if( k < kend )
7485 {
7486 SIMDType a1( set( A(i ,k) ) );
7487 SIMDType a2( set( A(i+1UL,k) ) );
7488 SIMDType a3( set( A(i+2UL,k) ) );
7489 SIMDType b1( B.load(k,j ) );
7490 SIMDType b2( B.load(k,j+SIMDSIZE) );
7491 SIMDType xmm1( a1 * b1 );
7492 SIMDType xmm2( a1 * b2 );
7493 SIMDType xmm3( a2 * b1 );
7494 SIMDType xmm4( a2 * b2 );
7495 SIMDType xmm5( a3 * b1 );
7496 SIMDType xmm6( a3 * b2 );
7497
7498 for( ++k; k<kend; ++k ) {
7499 a1 = set( A(i ,k) );
7500 a2 = set( A(i+1UL,k) );
7501 a3 = set( A(i+2UL,k) );
7502 b1 = B.load(k,j );
7503 b2 = B.load(k,j+SIMDSIZE);
7504 xmm1 += a1 * b1;
7505 xmm2 += a1 * b2;
7506 xmm3 += a2 * b1;
7507 xmm4 += a2 * b2;
7508 xmm5 += a3 * b1;
7509 xmm6 += a3 * b2;
7510 }
7511
7512 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7513 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
7514 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7515 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
7516 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
7517 C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
7518 }
7519 }
7520
7521 for( ; (i+2UL) <= iend; i+=2UL )
7522 {
7523 const size_t kbegin( ( IsUpper_v<MT4> )
7524 ?( ( IsLower_v<MT5> )
7525 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7526 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7527 :( IsLower_v<MT5> ? j : 0UL ) );
7528 const size_t kend( ( IsLower_v<MT4> )
7529 ?( ( IsUpper_v<MT5> )
7530 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
7531 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7532 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
7533
7534 size_t k( kbegin );
7535
7536 if( k < kend )
7537 {
7538 SIMDType a1( set( A(i ,k) ) );
7539 SIMDType a2( set( A(i+1UL,k) ) );
7540 SIMDType b1( B.load(k,j ) );
7541 SIMDType b2( B.load(k,j+SIMDSIZE) );
7542 SIMDType xmm1( a1 * b1 );
7543 SIMDType xmm2( a1 * b2 );
7544 SIMDType xmm3( a2 * b1 );
7545 SIMDType xmm4( a2 * b2 );
7546
7547 for( ++k; k<kend; ++k ) {
7548 a1 = set( A(i ,k) );
7549 a2 = set( A(i+1UL,k) );
7550 b1 = B.load(k,j );
7551 b2 = B.load(k,j+SIMDSIZE);
7552 xmm1 += a1 * b1;
7553 xmm2 += a1 * b2;
7554 xmm3 += a2 * b1;
7555 xmm4 += a2 * b2;
7556 }
7557
7558 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7559 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
7560 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7561 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
7562 }
7563 }
7564
7565 if( i < iend )
7566 {
7567 const size_t kbegin( ( IsUpper_v<MT4> )
7568 ?( ( IsLower_v<MT5> )
7569 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7570 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7571 :( IsLower_v<MT5> ? j : 0UL ) );
7572 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
7573
7574 size_t k( kbegin );
7575
7576 if( k < kend )
7577 {
7578 SIMDType a1( set( A(i,k) ) );
7579 SIMDType xmm1( a1 * B.load(k,j ) );
7580 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE) );
7581
7582 for( ++k; k<kend; ++k ) {
7583 a1 = set( A(i,k) );
7584 xmm1 += a1 * B.load(k,j );
7585 xmm2 += a1 * B.load(k,j+SIMDSIZE);
7586 }
7587
7588 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7589 C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) + xmm2 * factor );
7590 }
7591 }
7592 }
7593
7594 for( ; j<jpos; j+=SIMDSIZE )
7595 {
7596 const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
7597 size_t i( LOW ? j : 0UL );
7598
7599 for( ; (i+4UL) <= iend; i+=4UL )
7600 {
7601 const size_t kbegin( ( IsUpper_v<MT4> )
7602 ?( ( IsLower_v<MT5> )
7603 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7604 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7605 :( IsLower_v<MT5> ? j : 0UL ) );
7606 const size_t kend( ( IsLower_v<MT4> )
7607 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
7608 :( K ) );
7609
7610 size_t k( kbegin );
7611
7612 if( k < kend )
7613 {
7614 SIMDType b1( B.load(k,j) );
7615 SIMDType xmm1( set( A(i ,k) ) * b1 );
7616 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
7617 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
7618 SIMDType xmm4( set( A(i+3UL,k) ) * b1 );
7619
7620 for( ++k; k<kend; ++k ) {
7621 b1 = B.load(k,j);
7622 xmm1 += set( A(i ,k) ) * b1;
7623 xmm2 += set( A(i+1UL,k) ) * b1;
7624 xmm3 += set( A(i+2UL,k) ) * b1;
7625 xmm4 += set( A(i+3UL,k) ) * b1;
7626 }
7627
7628 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7629 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
7630 C.store( i+2UL, j, C.load(i+2UL,j) + xmm3 * factor );
7631 C.store( i+3UL, j, C.load(i+3UL,j) + xmm4 * factor );
7632 }
7633 }
7634
7635 for( ; (i+3UL) <= iend; i+=3UL )
7636 {
7637 const size_t kbegin( ( IsUpper_v<MT4> )
7638 ?( ( IsLower_v<MT5> )
7639 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7640 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7641 :( IsLower_v<MT5> ? j : 0UL ) );
7642 const size_t kend( ( IsLower_v<MT4> )
7643 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
7644 :( K ) );
7645
7646 size_t k( kbegin );
7647
7648 if( k < kend )
7649 {
7650 SIMDType b1( B.load(k,j) );
7651 SIMDType xmm1( set( A(i ,k) ) * b1 );
7652 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
7653 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
7654
7655 for( ++k; k<kend; ++k ) {
7656 b1 = B.load(k,j);
7657 xmm1 += set( A(i ,k) ) * b1;
7658 xmm2 += set( A(i+1UL,k) ) * b1;
7659 xmm3 += set( A(i+2UL,k) ) * b1;
7660 }
7661
7662 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7663 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
7664 C.store( i+2UL, j, C.load(i+2UL,j) + xmm3 * factor );
7665 }
7666 }
7667
7668 for( ; (i+2UL) <= iend; i+=2UL )
7669 {
7670 const size_t kbegin( ( IsUpper_v<MT4> )
7671 ?( ( IsLower_v<MT5> )
7672 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7673 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7674 :( IsLower_v<MT5> ? j : 0UL ) );
7675 const size_t kend( ( IsLower_v<MT4> )
7676 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7677 :( K ) );
7678
7679 size_t k( kbegin );
7680
7681 if( k < kend )
7682 {
7683 SIMDType b1( B.load(k,j) );
7684 SIMDType xmm1( set( A(i ,k) ) * b1 );
7685 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
7686
7687 for( ++k; k<kend; ++k ) {
7688 b1 = B.load(k,j);
7689 xmm1 += set( A(i ,k) ) * b1;
7690 xmm2 += set( A(i+1UL,k) ) * b1;
7691 }
7692
7693 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7694 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
7695 }
7696 }
7697
7698 if( i < iend )
7699 {
7700 const size_t kbegin( ( IsUpper_v<MT4> )
7701 ?( ( IsLower_v<MT5> )
7702 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7703 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7704 :( IsLower_v<MT5> ? j : 0UL ) );
7705
7706 size_t k( kbegin );
7707
7708 if( k < K )
7709 {
7710 SIMDType xmm1( set( A(i,k) ) * B.load(k,j) );
7711
7712 for( ++k; k<K; ++k ) {
7713 xmm1 += set( A(i,k) ) * B.load(k,j);
7714 }
7715
7716 C.store( i, j, C.load(i,j) + xmm1 * factor );
7717 }
7718 }
7719 }
7720
7721 for( ; remainder && j<N; ++j )
7722 {
7723 const size_t iend( UPP ? j+1UL : M );
7724 size_t i( LOW ? j : 0UL );
7725
7726 for( ; (i+2UL) <= iend; i+=2UL )
7727 {
7728 const size_t kbegin( ( IsUpper_v<MT4> )
7729 ?( ( IsLower_v<MT5> )
7730 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7731 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7732 :( IsLower_v<MT5> ? j : 0UL ) );
7733 const size_t kend( ( IsLower_v<MT4> )
7734 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7735 :( K ) );
7736
7737 size_t k( kbegin );
7738
7739 if( k < kend )
7740 {
7741 ElementType value1( A(i ,k) * B(k,j) );
7742 ElementType value2( A(i+1UL,k) * B(k,j) );
7743
7744 for( ++k; k<kend; ++k ) {
7745 value1 += A(i ,k) * B(k,j);
7746 value2 += A(i+1UL,k) * B(k,j);
7747 }
7748
7749 C(i ,j) += value1 * scalar;
7750 C(i+1UL,j) += value2 * scalar;
7751 }
7752 }
7753
7754 if( i < iend )
7755 {
7756 const size_t kbegin( ( IsUpper_v<MT4> )
7757 ?( ( IsLower_v<MT5> )
7758 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7759 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7760 :( IsLower_v<MT5> ? j : 0UL ) );
7761
7762 size_t k( kbegin );
7763
7764 if( k < K )
7765 {
7766 ElementType value( A(i,k) * B(k,j) );
7767
7768 for( ++k; k<K; ++k ) {
7769 value += A(i,k) * B(k,j);
7770 }
7771
7772 C(i,j) += value * scalar;
7773 }
7774 }
7775 }
7776 }
7777 //**********************************************************************************************
7778
7779 //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
7794 template< typename MT3 // Type of the left-hand side target matrix
7795 , typename MT4 // Type of the left-hand side matrix operand
7796 , typename MT5 // Type of the right-hand side matrix operand
7797 , typename ST2 > // Type of the scalar value
7798 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7799 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7800 {
7805
7806 const ForwardFunctor fwd;
7807
7808 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7809 const OppositeType_t<MT4> tmp( serial( A ) );
7810 addAssign( C, fwd( tmp * B ) * scalar );
7811 }
7812 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7813 const OppositeType_t<MT5> tmp( serial( B ) );
7814 addAssign( C, fwd( A * tmp ) * scalar );
7815 }
7816 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7817 const OppositeType_t<MT4> tmp( serial( A ) );
7818 addAssign( C, fwd( tmp * B ) * scalar );
7819 }
7820 else {
7821 const OppositeType_t<MT5> tmp( serial( B ) );
7822 addAssign( C, fwd( A * tmp ) * scalar );
7823 }
7824 }
7825 //**********************************************************************************************
7826
7827 //**Default addition assignment to dense matrices (large matrices)******************************
/*!\brief Default addition assignment of a scaled dense matrix-dense matrix multiplication
//        (large matrices).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload takes part in overload resolution only if
// \c UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> does not hold. It performs no work of its
// own and forwards all arguments unchanged to \c selectDefaultAddAssignKernel().
*/
7841 template< typename MT3 // Type of the left-hand side target matrix
7842 , typename MT4 // Type of the left-hand side matrix operand
7843 , typename MT5 // Type of the right-hand side matrix operand
7844 , typename ST2 > // Type of the scalar value
7845 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7846 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7847 {
7848 selectDefaultAddAssignKernel( C, A, B, scalar );
7849 }
7850 //**********************************************************************************************
7851
7852 //**Vectorized default addition assignment to dense matrices (large matrices)*******************
7867 template< typename MT3 // Type of the left-hand side target matrix
7868 , typename MT4 // Type of the left-hand side matrix operand
7869 , typename MT5 // Type of the right-hand side matrix operand
7870 , typename ST2 > // Type of the scalar value
7871 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7872 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7873 {
7874 if( LOW )
7875 lmmm( C, A, B, scalar, ST2(1) );
7876 else if( UPP )
7877 ummm( C, A, B, scalar, ST2(1) );
7878 else
7879 mmm( C, A, B, scalar, ST2(1) );
7880 }
7881 //**********************************************************************************************
7882
7883 //**BLAS-based addition assignment to dense matrices (default)**********************************
/*!\brief Fallback for the BLAS-based addition assignment of a scaled dense matrix-dense matrix
//        multiplication.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload takes part in overload resolution only if
// \c UseBlasKernel_v<MT3,MT4,MT5,ST2> does not hold. It forwards all arguments unchanged to
// \c selectLargeAddAssignKernel().
*/
7897 template< typename MT3 // Type of the left-hand side target matrix
7898 , typename MT4 // Type of the left-hand side matrix operand
7899 , typename MT5 // Type of the right-hand side matrix operand
7900 , typename ST2 > // Type of the scalar value
7901 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7902 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7903 {
7904 selectLargeAddAssignKernel( C, A, B, scalar );
7905 }
7906 //**********************************************************************************************
7907
7908 //**BLAS-based addition assignment to dense matrices********************************************
7909#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7923 template< typename MT3 // Type of the left-hand side target matrix
7924 , typename MT4 // Type of the left-hand side matrix operand
7925 , typename MT5 // Type of the right-hand side matrix operand
7926 , typename ST2 > // Type of the scalar value
7927 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7928 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7929 {
7930 using ET = ElementType_t<MT3>;
7931
7932 if( IsTriangular_v<MT4> ) {
7933 ResultType_t<MT3> tmp( serial( B ) );
7934 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7935 addAssign( C, tmp );
7936 }
7937 else if( IsTriangular_v<MT5> ) {
7938 ResultType_t<MT3> tmp( serial( A ) );
7939 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7940 addAssign( C, tmp );
7941 }
7942 else {
7943 gemm( C, A, B, ET(scalar), ET(1) );
7944 }
7945 }
7946#endif
7947 //**********************************************************************************************
7948
7949 //**Restructuring addition assignment to column-major matrices**********************************
7963 template< typename MT > // Type of the target matrix
7964 friend inline auto addAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7965 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7966 {
7968
7970
7971 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
7972 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
7973
7974 const ForwardFunctor fwd;
7975
7976 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
7977 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
7978
7979 addAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
7980 }
7981 //**********************************************************************************************
7982
7983 //**Addition assignment to sparse matrices******************************************************
7984 // No special implementation for the addition assignment to sparse matrices.
7985 //**********************************************************************************************
7986
7987 //**Subtraction assignment to dense matrices****************************************************
7999 template< typename MT // Type of the target dense matrix
8000 , bool SO > // Storage order of the target dense matrix
8001 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8002 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8003 {
8005
8006 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
8007 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
8008
8009 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8010 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8011
8012 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
8013 return;
8014 }
8015
8016 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
8017 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
8018
8019 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8020 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8021 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8022 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8023 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
8024 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
8025
8026 DMatScalarMultExpr::selectSubAssignKernel( *lhs, A, B, rhs.scalar_ );
8027 }
8028 //**********************************************************************************************
8029
8030 //**Subtraction assignment to dense matrices (kernel selection)*********************************
8041 template< typename MT3 // Type of the left-hand side target matrix
8042 , typename MT4 // Type of the left-hand side matrix operand
8043 , typename MT5 // Type of the right-hand side matrix operand
8044 , typename ST2 > // Type of the scalar value
8045 static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8046 {
8047 if( ( IsDiagonal_v<MT5> ) ||
8048 ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
8049 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
8050 selectSmallSubAssignKernel( C, A, B, scalar );
8051 else
8052 selectBlasSubAssignKernel( C, A, B, scalar );
8053 }
8054 //**********************************************************************************************
8055
8056 //**Default subtraction assignment to dense matrices (general/general)**************************
/*!\brief Default subtraction assignment of a scaled dense matrix-dense matrix multiplication
//        (general/general).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled if neither operand type is diagonal. The scaled product
// \c A*B*scalar is first evaluated via \c serial() into a temporary of type \c ResultType,
// which is then subtracted from \a C by means of \c subAssign().
*/
8070 template< typename MT3 // Type of the left-hand side target matrix
8071 , typename MT4 // Type of the left-hand side matrix operand
8072 , typename MT5 // Type of the right-hand side matrix operand
8073 , typename ST2 > // Type of the scalar value
8074 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8075 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8076 {
8077 const ResultType tmp( serial( A * B * scalar ) );
8078 subAssign( C, tmp );
8079 }
8080 //**********************************************************************************************
8081
8082 //**Default subtraction assignment to dense matrices (general/diagonal)*************************
8096 template< typename MT3 // Type of the left-hand side target matrix
8097 , typename MT4 // Type of the left-hand side matrix operand
8098 , typename MT5 // Type of the right-hand side matrix operand
8099 , typename ST2 > // Type of the scalar value
8100 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8101 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8102 {
8104
8105 const size_t M( A.rows() );
8106 const size_t N( B.columns() );
8107
8108 for( size_t i=0UL; i<M; ++i )
8109 {
8110 const size_t jbegin( ( IsUpper_v<MT4> )
8111 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
8112 :( 0UL ) );
8113 const size_t jend( ( IsLower_v<MT4> )
8114 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
8115 :( N ) );
8116 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
8117
8118 const size_t jnum( jend - jbegin );
8119 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
8120 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
8121
8122 for( size_t j=jbegin; j<jpos; j+=2UL ) {
8123 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
8124 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
8125 }
8126 if( jpos < jend ) {
8127 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
8128 }
8129 }
8130 }
8131 //**********************************************************************************************
8132
8133 //**Default subtraction assignment to dense matrices (diagonal/general)*************************
8147 template< typename MT3 // Type of the left-hand side target matrix
8148 , typename MT4 // Type of the left-hand side matrix operand
8149 , typename MT5 // Type of the right-hand side matrix operand
8150 , typename ST2 > // Type of the scalar value
8151 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8152 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8153 {
8155
8156 const size_t M( A.rows() );
8157 const size_t N( B.columns() );
8158
8159 for( size_t i=0UL; i<M; ++i )
8160 {
8161 const size_t jbegin( ( IsUpper_v<MT5> )
8162 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
8163 :( 0UL ) );
8164 const size_t jend( ( IsLower_v<MT5> )
8165 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
8166 :( N ) );
8167 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
8168
8169 const size_t jnum( jend - jbegin );
8170 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
8171 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
8172
8173 for( size_t j=jbegin; j<jpos; j+=2UL ) {
8174 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
8175 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
8176 }
8177 if( jpos < jend ) {
8178 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
8179 }
8180 }
8181 }
8182 //**********************************************************************************************
8183
8184 //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
8198 template< typename MT3 // Type of the left-hand side target matrix
8199 , typename MT4 // Type of the left-hand side matrix operand
8200 , typename MT5 // Type of the right-hand side matrix operand
8201 , typename ST2 > // Type of the scalar value
8202 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8203 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8204 {
8206
8207 for( size_t i=0UL; i<A.rows(); ++i ) {
8208 C(i,i) -= A(i,i) * B(i,i) * scalar;
8209 }
8210 }
8211 //**********************************************************************************************
8212
8213 //**Default subtraction assignment to dense matrices (small matrices)***************************
/*!\brief Default subtraction assignment of a scaled dense matrix-dense matrix multiplication
//        (small matrices).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload takes part in overload resolution only if
// \c UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> does not hold. It forwards all arguments
// unchanged to \c selectDefaultSubAssignKernel().
*/
8227 template< typename MT3 // Type of the left-hand side target matrix
8228 , typename MT4 // Type of the left-hand side matrix operand
8229 , typename MT5 // Type of the right-hand side matrix operand
8230 , typename ST2 > // Type of the scalar value
8231 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8232 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8233 {
8234 selectDefaultSubAssignKernel( C, A, B, scalar );
8235 }
8236 //**********************************************************************************************
8237
8238 //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
8253 template< typename MT3 // Type of the left-hand side target matrix
8254 , typename MT4 // Type of the left-hand side matrix operand
8255 , typename MT5 // Type of the right-hand side matrix operand
8256 , typename ST2 > // Type of the scalar value
8257 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8258 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8259 {
8260 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
8261
8262 const size_t M( A.rows() );
8263 const size_t N( B.columns() );
8264 const size_t K( A.columns() );
8265
8266 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8267
8268 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
8269 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
8270
8271 const SIMDType factor( set( scalar ) );
8272
8273 size_t j( 0UL );
8274
8275 if( IsIntegral_v<ElementType> )
8276 {
8277 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
8278 for( size_t i=0UL; i<M; ++i )
8279 {
8280 const size_t kbegin( ( IsUpper_v<MT4> )
8281 ?( ( IsLower_v<MT5> )
8282 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8283 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8284 :( IsLower_v<MT5> ? j : 0UL ) );
8285 const size_t kend( ( IsLower_v<MT4> )
8286 ?( ( IsUpper_v<MT5> )
8287 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
8288 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
8289 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
8290
8291 size_t k( kbegin );
8292
8293 if( k < kend )
8294 {
8295 SIMDType a1( set( A(i,k) ) );
8296 SIMDType xmm1( a1 * B.load(k,j ) );
8297 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
8298 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
8299 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
8300 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
8301 SIMDType xmm6( a1 * B.load(k,j+SIMDSIZE*5UL) );
8302 SIMDType xmm7( a1 * B.load(k,j+SIMDSIZE*6UL) );
8303 SIMDType xmm8( a1 * B.load(k,j+SIMDSIZE*7UL) );
8304
8305 for( ++k; k<kend; ++k ) {
8306 a1 = set( A(i,k) );
8307 xmm1 += a1 * B.load(k,j );
8308 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8309 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8310 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8311 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
8312 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
8313 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
8314 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
8315 }
8316
8317 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8318 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
8319 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
8320 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
8321 C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
8322 C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
8323 C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
8324 C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
8325 }
8326 }
8327 }
8328 }
8329
8330 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
8331 {
8332 size_t i( 0UL );
8333
8334 for( ; (i+2UL) <= M; i+=2UL )
8335 {
8336 const size_t kbegin( ( IsUpper_v<MT4> )
8337 ?( ( IsLower_v<MT5> )
8338 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8339 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8340 :( IsLower_v<MT5> ? j : 0UL ) );
8341 const size_t kend( ( IsLower_v<MT4> )
8342 ?( ( IsUpper_v<MT5> )
8343 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
8344 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8345 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
8346
8347 size_t k( kbegin );
8348
8349 if( k < kend )
8350 {
8351 SIMDType a1( set( A(i ,k) ) );
8352 SIMDType a2( set( A(i+1UL,k) ) );
8353 SIMDType b1( B.load(k,j ) );
8354 SIMDType b2( B.load(k,j+SIMDSIZE ) );
8355 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8356 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8357 SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
8358 SIMDType xmm1 ( a1 * b1 );
8359 SIMDType xmm2 ( a1 * b2 );
8360 SIMDType xmm3 ( a1 * b3 );
8361 SIMDType xmm4 ( a1 * b4 );
8362 SIMDType xmm5 ( a1 * b5 );
8363 SIMDType xmm6 ( a2 * b1 );
8364 SIMDType xmm7 ( a2 * b2 );
8365 SIMDType xmm8 ( a2 * b3 );
8366 SIMDType xmm9 ( a2 * b4 );
8367 SIMDType xmm10( a2 * b5 );
8368
8369 for( ++k; k<kend; ++k ) {
8370 a1 = set( A(i ,k) );
8371 a2 = set( A(i+1UL,k) );
8372 b1 = B.load(k,j );
8373 b2 = B.load(k,j+SIMDSIZE );
8374 b3 = B.load(k,j+SIMDSIZE*2UL);
8375 b4 = B.load(k,j+SIMDSIZE*3UL);
8376 b5 = B.load(k,j+SIMDSIZE*4UL);
8377 xmm1 += a1 * b1;
8378 xmm2 += a1 * b2;
8379 xmm3 += a1 * b3;
8380 xmm4 += a1 * b4;
8381 xmm5 += a1 * b5;
8382 xmm6 += a2 * b1;
8383 xmm7 += a2 * b2;
8384 xmm8 += a2 * b3;
8385 xmm9 += a2 * b4;
8386 xmm10 += a2 * b5;
8387 }
8388
8389 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8390 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
8391 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
8392 C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
8393 C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
8394 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
8395 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
8396 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
8397 C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
8398 C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
8399 }
8400 }
8401
8402 if( i < M )
8403 {
8404 const size_t kbegin( ( IsUpper_v<MT4> )
8405 ?( ( IsLower_v<MT5> )
8406 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8407 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8408 :( IsLower_v<MT5> ? j : 0UL ) );
8409 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
8410
8411 size_t k( kbegin );
8412
8413 if( k < kend )
8414 {
8415 SIMDType a1( set( A(i,k) ) );
8416 SIMDType xmm1( a1 * B.load(k,j ) );
8417 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
8418 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
8419 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
8420 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
8421
8422 for( ++k; k<kend; ++k ) {
8423 a1 = set( A(i,k) );
8424 xmm1 += a1 * B.load(k,j );
8425 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8426 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8427 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8428 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
8429 }
8430
8431 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8432 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
8433 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
8434 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
8435 C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
8436 }
8437 }
8438 }
8439
8440 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
8441 {
8442 size_t i( 0UL );
8443
8444 for( ; (i+2UL) <= M; i+=2UL )
8445 {
8446 const size_t kbegin( ( IsUpper_v<MT4> )
8447 ?( ( IsLower_v<MT5> )
8448 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8449 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8450 :( IsLower_v<MT5> ? j : 0UL ) );
8451 const size_t kend( ( IsLower_v<MT4> )
8452 ?( ( IsUpper_v<MT5> )
8453 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
8454 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8455 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
8456
8457 size_t k( kbegin );
8458
8459 if( k < kend )
8460 {
8461 SIMDType a1( set( A(i ,k) ) );
8462 SIMDType a2( set( A(i+1UL,k) ) );
8463 SIMDType b1( B.load(k,j ) );
8464 SIMDType b2( B.load(k,j+SIMDSIZE ) );
8465 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8466 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8467 SIMDType xmm1( a1 * b1 );
8468 SIMDType xmm2( a1 * b2 );
8469 SIMDType xmm3( a1 * b3 );
8470 SIMDType xmm4( a1 * b4 );
8471 SIMDType xmm5( a2 * b1 );
8472 SIMDType xmm6( a2 * b2 );
8473 SIMDType xmm7( a2 * b3 );
8474 SIMDType xmm8( a2 * b4 );
8475
8476 for( ++k; k<kend; ++k ) {
8477 a1 = set( A(i ,k) );
8478 a2 = set( A(i+1UL,k) );
8479 b1 = B.load(k,j );
8480 b2 = B.load(k,j+SIMDSIZE );
8481 b3 = B.load(k,j+SIMDSIZE*2UL);
8482 b4 = B.load(k,j+SIMDSIZE*3UL);
8483 xmm1 += a1 * b1;
8484 xmm2 += a1 * b2;
8485 xmm3 += a1 * b3;
8486 xmm4 += a1 * b4;
8487 xmm5 += a2 * b1;
8488 xmm6 += a2 * b2;
8489 xmm7 += a2 * b3;
8490 xmm8 += a2 * b4;
8491 }
8492
8493 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8494 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
8495 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
8496 C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
8497 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
8498 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
8499 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
8500 C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
8501 }
8502 }
8503
8504 if( i < M )
8505 {
8506 const size_t kbegin( ( IsUpper_v<MT4> )
8507 ?( ( IsLower_v<MT5> )
8508 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8509 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8510 :( IsLower_v<MT5> ? j : 0UL ) );
8511 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
8512
8513 size_t k( kbegin );
8514
8515 if( k < kend )
8516 {
8517 SIMDType a1( set( A(i,k) ) );
8518 SIMDType xmm1( a1 * B.load(k,j ) );
8519 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
8520 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
8521 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
8522
8523 for( ++k; k<kend; ++k ) {
8524 a1 = set( A(i,k) );
8525 xmm1 += a1 * B.load(k,j );
8526 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8527 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8528 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8529 }
8530
8531 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8532 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
8533 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
8534 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
8535 }
8536 }
8537 }
8538
8539 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8540 {
8541 size_t i( 0UL );
8542
8543 for( ; (i+2UL) <= M; i+=2UL )
8544 {
8545 const size_t kbegin( ( IsUpper_v<MT4> )
8546 ?( ( IsLower_v<MT5> )
8547 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8548 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8549 :( IsLower_v<MT5> ? j : 0UL ) );
8550 const size_t kend( ( IsLower_v<MT4> )
8551 ?( ( IsUpper_v<MT5> )
8552 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
8553 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8554 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
8555
8556 size_t k( kbegin );
8557
8558 if( k < kend )
8559 {
8560 SIMDType a1( set( A(i ,k) ) );
8561 SIMDType a2( set( A(i+1UL,k) ) );
8562 SIMDType b1( B.load(k,j ) );
8563 SIMDType b2( B.load(k,j+SIMDSIZE ) );
8564 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8565 SIMDType xmm1( a1 * b1 );
8566 SIMDType xmm2( a1 * b2 );
8567 SIMDType xmm3( a1 * b3 );
8568 SIMDType xmm4( a2 * b1 );
8569 SIMDType xmm5( a2 * b2 );
8570 SIMDType xmm6( a2 * b3 );
8571
8572 for( ++k; k<kend; ++k ) {
8573 a1 = set( A(i ,k) );
8574 a2 = set( A(i+1UL,k) );
8575 b1 = B.load(k,j );
8576 b2 = B.load(k,j+SIMDSIZE );
8577 b3 = B.load(k,j+SIMDSIZE*2UL);
8578 xmm1 += a1 * b1;
8579 xmm2 += a1 * b2;
8580 xmm3 += a1 * b3;
8581 xmm4 += a2 * b1;
8582 xmm5 += a2 * b2;
8583 xmm6 += a2 * b3;
8584 }
8585
8586 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8587 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
8588 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
8589 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
8590 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
8591 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
8592 }
8593 }
8594
8595 if( i < M )
8596 {
8597 const size_t kbegin( ( IsUpper_v<MT4> )
8598 ?( ( IsLower_v<MT5> )
8599 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8600 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8601 :( IsLower_v<MT5> ? j : 0UL ) );
8602 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
8603
8604 size_t k( kbegin );
8605
8606 if( k < kend )
8607 {
8608 SIMDType a1( set( A(i,k) ) );
8609 SIMDType xmm1( a1 * B.load(k,j ) );
8610 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
8611 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
8612
8613 for( ++k; k<kend; ++k ) {
8614 a1 = set( A(i,k) );
8615 xmm1 += a1 * B.load(k,j );
8616 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8617 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8618 }
8619
8620 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8621 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
8622 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
8623 }
8624 }
8625 }
8626
8627 for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8628 {
8629 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
8630 size_t i( LOW ? j : 0UL );
8631
8632 for( ; (i+4UL) <= iend; i+=4UL )
8633 {
8634 const size_t kbegin( ( IsUpper_v<MT4> )
8635 ?( ( IsLower_v<MT5> )
8636 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8637 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8638 :( IsLower_v<MT5> ? j : 0UL ) );
8639 const size_t kend( ( IsLower_v<MT4> )
8640 ?( ( IsUpper_v<MT5> )
8641 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
8642 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8643 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8644
8645 size_t k( kbegin );
8646
8647 if( k < kend )
8648 {
8649 SIMDType a1( set( A(i ,k) ) );
8650 SIMDType a2( set( A(i+1UL,k) ) );
8651 SIMDType a3( set( A(i+2UL,k) ) );
8652 SIMDType a4( set( A(i+3UL,k) ) );
8653 SIMDType b1( B.load(k,j ) );
8654 SIMDType b2( B.load(k,j+SIMDSIZE) );
8655 SIMDType xmm1( a1 * b1 );
8656 SIMDType xmm2( a1 * b2 );
8657 SIMDType xmm3( a2 * b1 );
8658 SIMDType xmm4( a2 * b2 );
8659 SIMDType xmm5( a3 * b1 );
8660 SIMDType xmm6( a3 * b2 );
8661 SIMDType xmm7( a4 * b1 );
8662 SIMDType xmm8( a4 * b2 );
8663
8664 for( ++k; k<kend; ++k ) {
8665 a1 = set( A(i ,k) );
8666 a2 = set( A(i+1UL,k) );
8667 a3 = set( A(i+2UL,k) );
8668 a4 = set( A(i+3UL,k) );
8669 b1 = B.load(k,j );
8670 b2 = B.load(k,j+SIMDSIZE);
8671 xmm1 += a1 * b1;
8672 xmm2 += a1 * b2;
8673 xmm3 += a2 * b1;
8674 xmm4 += a2 * b2;
8675 xmm5 += a3 * b1;
8676 xmm6 += a3 * b2;
8677 xmm7 += a4 * b1;
8678 xmm8 += a4 * b2;
8679 }
8680
8681 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8682 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
8683 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8684 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
8685 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
8686 C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
8687 C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
8688 C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
8689 }
8690 }
8691
8692 for( ; (i+3UL) <= iend; i+=3UL )
8693 {
8694 const size_t kbegin( ( IsUpper_v<MT4> )
8695 ?( ( IsLower_v<MT5> )
8696 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8697 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8698 :( IsLower_v<MT5> ? j : 0UL ) );
8699 const size_t kend( ( IsLower_v<MT4> )
8700 ?( ( IsUpper_v<MT5> )
8701 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
8702 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8703 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8704
8705 size_t k( kbegin );
8706
8707 if( k < kend )
8708 {
8709 SIMDType a1( set( A(i ,k) ) );
8710 SIMDType a2( set( A(i+1UL,k) ) );
8711 SIMDType a3( set( A(i+2UL,k) ) );
8712 SIMDType b1( B.load(k,j ) );
8713 SIMDType b2( B.load(k,j+SIMDSIZE) );
8714 SIMDType xmm1( a1 * b1 );
8715 SIMDType xmm2( a1 * b2 );
8716 SIMDType xmm3( a2 * b1 );
8717 SIMDType xmm4( a2 * b2 );
8718 SIMDType xmm5( a3 * b1 );
8719 SIMDType xmm6( a3 * b2 );
8720
8721 for( ++k; k<kend; ++k ) {
8722 a1 = set( A(i ,k) );
8723 a2 = set( A(i+1UL,k) );
8724 a3 = set( A(i+2UL,k) );
8725 b1 = B.load(k,j );
8726 b2 = B.load(k,j+SIMDSIZE);
8727 xmm1 += a1 * b1;
8728 xmm2 += a1 * b2;
8729 xmm3 += a2 * b1;
8730 xmm4 += a2 * b2;
8731 xmm5 += a3 * b1;
8732 xmm6 += a3 * b2;
8733 }
8734
8735 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8736 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
8737 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8738 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
8739 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
8740 C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
8741 }
8742 }
8743
8744 for( ; (i+2UL) <= iend; i+=2UL )
8745 {
8746 const size_t kbegin( ( IsUpper_v<MT4> )
8747 ?( ( IsLower_v<MT5> )
8748 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8749 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8750 :( IsLower_v<MT5> ? j : 0UL ) );
8751 const size_t kend( ( IsLower_v<MT4> )
8752 ?( ( IsUpper_v<MT5> )
8753 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8754 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8755 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8756
8757 size_t k( kbegin );
8758
8759 if( k < kend )
8760 {
8761 SIMDType a1( set( A(i ,k) ) );
8762 SIMDType a2( set( A(i+1UL,k) ) );
8763 SIMDType b1( B.load(k,j ) );
8764 SIMDType b2( B.load(k,j+SIMDSIZE) );
8765 SIMDType xmm1( a1 * b1 );
8766 SIMDType xmm2( a1 * b2 );
8767 SIMDType xmm3( a2 * b1 );
8768 SIMDType xmm4( a2 * b2 );
8769
8770 for( ++k; k<kend; ++k ) {
8771 a1 = set( A(i ,k) );
8772 a2 = set( A(i+1UL,k) );
8773 b1 = B.load(k,j );
8774 b2 = B.load(k,j+SIMDSIZE);
8775 xmm1 += a1 * b1;
8776 xmm2 += a1 * b2;
8777 xmm3 += a2 * b1;
8778 xmm4 += a2 * b2;
8779 }
8780
8781 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8782 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
8783 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8784 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
8785 }
8786 }
8787
8788 if( i < iend )
8789 {
8790 const size_t kbegin( ( IsUpper_v<MT4> )
8791 ?( ( IsLower_v<MT5> )
8792 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8793 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8794 :( IsLower_v<MT5> ? j : 0UL ) );
8795 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8796
8797 size_t k( kbegin );
8798
8799 if( k < kend )
8800 {
8801 SIMDType a1( set( A(i,k) ) );
8802 SIMDType xmm1( a1 * B.load(k,j ) );
8803 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE) );
8804
8805 for( ++k; k<kend; ++k ) {
8806 a1 = set( A(i,k) );
8807 xmm1 += a1 * B.load(k,j );
8808 xmm2 += a1 * B.load(k,j+SIMDSIZE);
8809 }
8810
8811 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8812 C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) - xmm2 * factor );
8813 }
8814 }
8815 }
8816
8817 for( ; j<jpos; j+=SIMDSIZE )
8818 {
8819 const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
8820 size_t i( LOW ? j : 0UL );
8821
8822 for( ; (i+4UL) <= iend; i+=4UL )
8823 {
8824 const size_t kbegin( ( IsUpper_v<MT4> )
8825 ?( ( IsLower_v<MT5> )
8826 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8827 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8828 :( IsLower_v<MT5> ? j : 0UL ) );
8829 const size_t kend( ( IsLower_v<MT4> )
8830 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8831 :( K ) );
8832
8833 size_t k( kbegin );
8834
8835 if( k < kend )
8836 {
8837 SIMDType b1( B.load(k,j) );
8838 SIMDType xmm1( set( A(i ,k) ) * b1 );
8839 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
8840 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
8841 SIMDType xmm4( set( A(i+3UL,k) ) * b1 );
8842
8843 for( ++k; k<kend; ++k ) {
8844 b1 = B.load(k,j);
8845 xmm1 += set( A(i ,k) ) * b1;
8846 xmm2 += set( A(i+1UL,k) ) * b1;
8847 xmm3 += set( A(i+2UL,k) ) * b1;
8848 xmm4 += set( A(i+3UL,k) ) * b1;
8849 }
8850
8851 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8852 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
8853 C.store( i+2UL, j, C.load(i+2UL,j) - xmm3 * factor );
8854 C.store( i+3UL, j, C.load(i+3UL,j) - xmm4 * factor );
8855 }
8856 }
8857
8858 for( ; (i+3UL) <= iend; i+=3UL )
8859 {
8860 const size_t kbegin( ( IsUpper_v<MT4> )
8861 ?( ( IsLower_v<MT5> )
8862 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8863 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8864 :( IsLower_v<MT5> ? j : 0UL ) );
8865 const size_t kend( ( IsLower_v<MT4> )
8866 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8867 :( K ) );
8868
8869 size_t k( kbegin );
8870
8871 if( k < kend )
8872 {
8873 SIMDType b1( B.load(k,j) );
8874 SIMDType xmm1( set( A(i ,k) ) * b1 );
8875 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
8876 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
8877
8878 for( ++k; k<kend; ++k ) {
8879 b1 = B.load(k,j);
8880 xmm1 += set( A(i ,k) ) * b1;
8881 xmm2 += set( A(i+1UL,k) ) * b1;
8882 xmm3 += set( A(i+2UL,k) ) * b1;
8883 }
8884
8885 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8886 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
8887 C.store( i+2UL, j, C.load(i+2UL,j) - xmm3 * factor );
8888 }
8889 }
8890
8891 for( ; (i+2UL) <= iend; i+=2UL )
8892 {
8893 const size_t kbegin( ( IsUpper_v<MT4> )
8894 ?( ( IsLower_v<MT5> )
8895 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8896 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8897 :( IsLower_v<MT5> ? j : 0UL ) );
8898 const size_t kend( ( IsLower_v<MT4> )
8899 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8900 :( K ) );
8901
8902 size_t k( kbegin );
8903
8904 if( k < kend )
8905 {
8906 SIMDType b1( B.load(k,j) );
8907 SIMDType xmm1( set( A(i ,k) ) * b1 );
8908 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
8909
8910 for( ++k; k<kend; ++k ) {
8911 b1 = B.load(k,j);
8912 xmm1 += set( A(i ,k) ) * b1;
8913 xmm2 += set( A(i+1UL,k) ) * b1;
8914 }
8915
8916 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8917 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
8918 }
8919 }
8920
8921 if( i < iend )
8922 {
8923 const size_t kbegin( ( IsUpper_v<MT4> )
8924 ?( ( IsLower_v<MT5> )
8925 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8926 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8927 :( IsLower_v<MT5> ? j : 0UL ) );
8928
8929 size_t k( kbegin );
8930
8931 if( k < K )
8932 {
8933 SIMDType xmm1( set( A(i,k) ) * B.load(k,j) );
8934
8935 for( ++k; k<K; ++k ) {
8936 xmm1 += set( A(i,k) ) * B.load(k,j);
8937 }
8938
8939 C.store( i, j, C.load(i,j) - xmm1 * factor );
8940 }
8941 }
8942 }
8943
8944 for( ; remainder && j<N; ++j )
8945 {
8946 const size_t iend( UPP ? j+1UL : M );
8947 size_t i( LOW ? j : 0UL );
8948
8949 for( ; (i+2UL) <= iend; i+=2UL )
8950 {
8951 const size_t kbegin( ( IsUpper_v<MT4> )
8952 ?( ( IsLower_v<MT5> )
8953 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8954 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8955 :( IsLower_v<MT5> ? j : 0UL ) );
8956 const size_t kend( ( IsLower_v<MT4> )
8957 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8958 :( K ) );
8959
8960 size_t k( kbegin );
8961
8962 if( k < kend )
8963 {
8964 ElementType value1( A(i ,k) * B(k,j) );
8965 ElementType value2( A(i+1UL,k) * B(k,j) );
8966
8967 for( ++k; k<kend; ++k ) {
8968 value1 += A(i ,k) * B(k,j);
8969 value2 += A(i+1UL,k) * B(k,j);
8970 }
8971
8972 C(i ,j) -= value1 * scalar;
8973 C(i+1UL,j) -= value2 * scalar;
8974 }
8975 }
8976
8977 if( i < iend )
8978 {
8979 const size_t kbegin( ( IsUpper_v<MT4> )
8980 ?( ( IsLower_v<MT5> )
8981 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8982 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8983 :( IsLower_v<MT5> ? j : 0UL ) );
8984
8985 size_t k( kbegin );
8986
8987 if( k < K )
8988 {
8989 ElementType value( A(i,k) * B(k,j) );
8990
8991 for( ++k; k<K; ++k ) {
8992 value += A(i,k) * B(k,j);
8993 }
8994
8995 C(i,j) -= value * scalar;
8996 }
8997 }
8998 }
8999 }
9000 //**********************************************************************************************
9001
9002 //**********************************************************************************************
9003 //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
9017 template< typename MT3 // Type of the left-hand side target matrix
9018 , typename MT4 // Type of the left-hand side matrix operand
9019 , typename MT5 // Type of the right-hand side matrix operand
9020 , typename ST2 > // Type of the scalar value
9021 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9022 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9023 {
9028
9029 const ForwardFunctor fwd;
9030
9031 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
9032 const OppositeType_t<MT4> tmp( serial( A ) );
9033 subAssign( C, fwd( tmp * B ) * scalar );
9034 }
9035 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
9036 const OppositeType_t<MT5> tmp( serial( B ) );
9037 subAssign( C, fwd( A * tmp ) * scalar );
9038 }
9039 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
9040 const OppositeType_t<MT4> tmp( serial( A ) );
9041 subAssign( C, fwd( tmp * B ) * scalar );
9042 }
9043 else {
9044 const OppositeType_t<MT5> tmp( serial( B ) );
9045 subAssign( C, fwd( A * tmp ) * scalar );
9046 }
9047 }
9048 //**********************************************************************************************
9049
9050 //**Default subtraction assignment to dense matrices (large matrices)***************************
9064 template< typename MT3 // Type of the left-hand side target matrix
9065 , typename MT4 // Type of the left-hand side matrix operand
9066 , typename MT5 // Type of the right-hand side matrix operand
9067 , typename ST2 > // Type of the scalar value
9068 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9069 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9070 {
9071 selectDefaultSubAssignKernel( C, A, B, scalar );
9072 }
9073 //**********************************************************************************************
9074
9075 //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
9090 template< typename MT3 // Type of the left-hand side target matrix
9091 , typename MT4 // Type of the left-hand side matrix operand
9092 , typename MT5 // Type of the right-hand side matrix operand
9093 , typename ST2 > // Type of the scalar value
9094 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9095 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9096 {
9097 if( LOW )
9098 lmmm( C, A, B, -scalar, ST2(1) );
9099 else if( UPP )
9100 ummm( C, A, B, -scalar, ST2(1) );
9101 else
9102 mmm( C, A, B, -scalar, ST2(1) );
9103 }
9104 //**********************************************************************************************
9105
9106 //**BLAS-based subtraction assignment to dense matrices (default)*******************************
9120 template< typename MT3 // Type of the left-hand side target matrix
9121 , typename MT4 // Type of the left-hand side matrix operand
9122 , typename MT5 // Type of the right-hand side matrix operand
9123 , typename ST2 > // Type of the scalar value
9124 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9125 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9126 {
9127 selectLargeSubAssignKernel( C, A, B, scalar );
9128 }
9129 //**********************************************************************************************
9130
9131 //**BLAS-based subtraction assignment to dense matrices*****************************************
9132#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9146 template< typename MT3 // Type of the left-hand side target matrix
9147 , typename MT4 // Type of the left-hand side matrix operand
9148 , typename MT5 // Type of the right-hand side matrix operand
9149 , typename ST2 > // Type of the scalar value
9150 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9151 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9152 {
9153 using ET = ElementType_t<MT3>;
9154
9155 if( IsTriangular_v<MT4> ) {
9156 ResultType_t<MT3> tmp( serial( B ) );
9157 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9158 subAssign( C, tmp );
9159 }
9160 else if( IsTriangular_v<MT5> ) {
9161 ResultType_t<MT3> tmp( serial( A ) );
9162 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9163 subAssign( C, tmp );
9164 }
9165 else {
9166 gemm( C, A, B, ET(-scalar), ET(1) );
9167 }
9168 }
9169#endif
9170 //**********************************************************************************************
9171
9172 //**Restructuring subtraction assignment to column-major matrices*******************************
9186 template< typename MT > // Type of the target matrix
9187 friend inline auto subAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
9188 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9189 {
9191
9193
9194 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9195 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9196
9197 const ForwardFunctor fwd;
9198
9199 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9200 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9201
9202 subAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9203 }
9204 //**********************************************************************************************
9205
9206 //**Subtraction assignment to sparse matrices***************************************************
9207 // No special implementation for the subtraction assignment to sparse matrices.
9208 //**********************************************************************************************
9209
9210 //**Schur product assignment to dense matrices**************************************************
9222 template< typename MT // Type of the target dense matrix
9223 , bool SO > // Storage order of the target dense matrix
9224 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9225 {
9227
9231
9232 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9233 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9234
9235 const ResultType tmp( serial( rhs ) );
9236 schurAssign( *lhs, tmp );
9237 }
9238 //**********************************************************************************************
9239
9240 //**Schur product assignment to sparse matrices*************************************************
9241 // No special implementation for the Schur product assignment to sparse matrices.
9242 //**********************************************************************************************
9243
9244 //**Multiplication assignment to dense matrices*************************************************
9245 // No special implementation for the multiplication assignment to dense matrices.
9246 //**********************************************************************************************
9247
9248 //**Multiplication assignment to sparse matrices************************************************
9249 // No special implementation for the multiplication assignment to sparse matrices.
9250 //**********************************************************************************************
9251
9252 //**SMP assignment to dense matrices************************************************************
9267 template< typename MT // Type of the target dense matrix
9268 , bool SO > // Storage order of the target dense matrix
9269 friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9270 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9271 {
9273
9274 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9275 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9276
9277 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9278 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9279
9280 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
9281 return;
9282 }
9283 else if( left.columns() == 0UL ) {
9284 reset( *lhs );
9285 return;
9286 }
9287
9288 LT A( left ); // Evaluation of the left-hand side dense matrix operand
9289 RT B( right ); // Evaluation of the right-hand side dense matrix operand
9290
9291 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9292 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9293 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9294 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9295 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
9296 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
9297
9298 smpAssign( *lhs, A * B * rhs.scalar_ );
9299 }
9300 //**********************************************************************************************
9301
9302 //**SMP assignment to sparse matrices***********************************************************
9317 template< typename MT // Type of the target sparse matrix
9318 , bool SO > // Storage order of the target sparse matrix
9319 friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9320 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9321 {
9323
9324 using TmpType = If_t< SO, OppositeType, ResultType >;
9325
9332
9333 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9334 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9335
9336 const ForwardFunctor fwd;
9337
9338 const TmpType tmp( rhs );
9339 smpAssign( *lhs, fwd( tmp ) );
9340 }
9341 //**********************************************************************************************
9342
9343 //**Restructuring SMP assignment to column-major matrices***************************************
9357 template< typename MT > // Type of the target matrix
9358 friend inline auto smpAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
9359 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9360 {
9362
9364
9365 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9366 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9367
9368 const ForwardFunctor fwd;
9369
9370 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9371 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9372
9373 smpAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9374 }
9375 //**********************************************************************************************
9376
9377 //**SMP addition assignment to dense matrices***************************************************
9392 template< typename MT // Type of the target dense matrix
9393 , bool SO > // Storage order of the target dense matrix
9394 friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9395 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9396 {
9398
9399 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9400 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9401
9402 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9403 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9404
9405 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9406 return;
9407 }
9408
9409 LT A( left ); // Evaluation of the left-hand side dense matrix operand
9410 RT B( right ); // Evaluation of the right-hand side dense matrix operand
9411
9412 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9413 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9414 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9415 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9416 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
9417 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
9418
9419 smpAddAssign( *lhs, A * B * rhs.scalar_ );
9420 }
9421 //**********************************************************************************************
9422
9423 //**Restructuring SMP addition assignment to column-major matrices******************************
9437 template< typename MT > // Type of the target matrix
9438 friend inline auto smpAddAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
9439 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9440 {
9442
9444
9445 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9446 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9447
9448 const ForwardFunctor fwd;
9449
9450 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9451 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9452
9453 smpAddAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9454 }
9455 //**********************************************************************************************
9456
9457 //**SMP addition assignment to sparse matrices**************************************************
9458 // No special implementation for the SMP addition assignment to sparse matrices.
9459 //**********************************************************************************************
9460
9461 //**SMP subtraction assignment to dense matrices************************************************
9476 template< typename MT // Type of the target dense matrix
9477 , bool SO > // Storage order of the target dense matrix
9478 friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9479 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9480 {
9482
9483 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9484 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9485
9486 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9487 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9488
9489 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9490 return;
9491 }
9492
9493 LT A( left ); // Evaluation of the left-hand side dense matrix operand
9494 RT B( right ); // Evaluation of the right-hand side dense matrix operand
9495
9496 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9497 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9498 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9499 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9500 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
9501 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
9502
9503 smpSubAssign( *lhs, A * B * rhs.scalar_ );
9504 }
9505 //**********************************************************************************************
9506
9507 //**Restructuring SMP subtraction assignment to column-major matrices***************************
9521 template< typename MT > // Type of the target matrix
9522 friend inline auto smpSubAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
9523 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9524 {
9526
9528
9529 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9530 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9531
9532 const ForwardFunctor fwd;
9533
9534 decltype(auto) A( transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9535 decltype(auto) B( transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9536
9537 smpSubAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9538 }
9539 //**********************************************************************************************
9540
9541 //**SMP subtraction assignment to sparse matrices***********************************************
9542 // No special implementation for the SMP subtraction assignment to sparse matrices.
9543 //**********************************************************************************************
9544
9545 //**SMP Schur product assignment to dense matrices**********************************************
9557 template< typename MT // Type of the target dense matrix
9558 , bool SO > // Storage order of the target dense matrix
9559 friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9560 {
9562
9566
9567 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9568 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9569
9570 const ResultType tmp( rhs );
9571 smpSchurAssign( *lhs, tmp );
9572 }
9573 //**********************************************************************************************
9574
9575 //**SMP Schur product assignment to sparse matrices*********************************************
9576 // No special implementation for the SMP Schur product assignment to sparse matrices.
9577 //**********************************************************************************************
9578
9579 //**SMP multiplication assignment to dense matrices*********************************************
9580 // No special implementation for the SMP multiplication assignment to dense matrices.
9581 //**********************************************************************************************
9582
9583 //**SMP multiplication assignment to sparse matrices********************************************
9584 // No special implementation for the SMP multiplication assignment to sparse matrices.
9585 //**********************************************************************************************
9586
9587 //**Compile time checks*************************************************************************
9596 //**********************************************************************************************
9597};
9599//*************************************************************************************************
9600
9601
9602
9603
9604//=================================================================================================
9605//
9606// GLOBAL BINARY ARITHMETIC OPERATORS
9607//
9608//=================================================================================================
9609
9610//*************************************************************************************************
9637template< typename MT1 // Type of the left-hand side dense matrix
9638 , typename MT2 > // Type of the right-hand side dense matrix
9639inline decltype(auto)
9640 operator*( const DenseMatrix<MT1,false>& lhs, const DenseMatrix<MT2,false>& rhs )
9641{
9643
9644 if( (*lhs).columns() != (*rhs).rows() ) {
9645 BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
9646 }
9647
9649 return ReturnType( *lhs, *rhs );
9650}
9651//*************************************************************************************************
9652
9653
9654
9655
9656//=================================================================================================
9657//
9658// GLOBAL FUNCTIONS
9659//
9660//=================================================================================================
9661
9662//*************************************************************************************************
9685template< typename MT1 // Type of the left-hand side dense matrix
9686 , typename MT2 // Type of the right-hand side dense matrix
9687 , bool SF // Symmetry flag
9688 , bool HF // Hermitian flag
9689 , bool LF // Lower flag
9690 , bool UF > // Upper flag
9691inline decltype(auto) declsym( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9692{
9694
9695 if( !isSquare( dm ) ) {
9696 BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
9697 }
9698
9699 using ReturnType = const DMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
9700 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9701}
9703//*************************************************************************************************
9704
9705
9706//*************************************************************************************************
9729template< typename MT1 // Type of the left-hand side dense matrix
9730 , typename MT2 // Type of the right-hand side dense matrix
9731 , bool SF // Symmetry flag
9732 , bool HF // Hermitian flag
9733 , bool LF // Lower flag
9734 , bool UF > // Upper flag
9735inline decltype(auto) declherm( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9736{
9738
9739 if( !isSquare( dm ) ) {
9740 BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
9741 }
9742
9743 using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9744 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9745}
9747//*************************************************************************************************
9748
9749
9750//*************************************************************************************************
9773template< typename MT1 // Type of the left-hand side dense matrix
9774 , typename MT2 // Type of the right-hand side dense matrix
9775 , bool SF // Symmetry flag
9776 , bool HF // Hermitian flag
9777 , bool LF // Lower flag
9778 , bool UF > // Upper flag
9779inline decltype(auto) decllow( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9780{
9782
9783 if( !isSquare( dm ) ) {
9784 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9785 }
9786
9787 using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9788 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9789}
9791//*************************************************************************************************
9792
9793
9794//*************************************************************************************************
9817template< typename MT1 // Type of the left-hand side dense matrix
9818 , typename MT2 // Type of the right-hand side dense matrix
9819 , bool SF // Symmetry flag
9820 , bool HF // Hermitian flag
9821 , bool UF > // Upper flag
9822inline decltype(auto) declunilow( const DMatDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
9823{
9825
9826 if( !isSquare( dm ) ) {
9827 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9828 }
9829
9830 return declunilow( decllow( *dm ) );
9831}
9833//*************************************************************************************************
9834
9835
9836//*************************************************************************************************
9859template< typename MT1 // Type of the left-hand side dense matrix
9860 , typename MT2 // Type of the right-hand side dense matrix
9861 , bool SF // Symmetry flag
9862 , bool HF // Hermitian flag
9863 , bool UF > // Upper flag
9864inline decltype(auto) declstrlow( const DMatDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
9865{
9867
9868 if( !isSquare( dm ) ) {
9869 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9870 }
9871
9872 return declstrlow( decllow( *dm ) );
9873}
9875//*************************************************************************************************
9876
9877
9878//*************************************************************************************************
9901template< typename MT1 // Type of the left-hand side dense matrix
9902 , typename MT2 // Type of the right-hand side dense matrix
9903 , bool SF // Symmetry flag
9904 , bool HF // Hermitian flag
9905 , bool LF // Lower flag
9906 , bool UF > // Upper flag
9907inline decltype(auto) declupp( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9908{
9910
9911 if( !isSquare( dm ) ) {
9912 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9913 }
9914
9915 using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9916 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9917}
9919//*************************************************************************************************
9920
9921
9922//*************************************************************************************************
9945template< typename MT1 // Type of the left-hand side dense matrix
9946 , typename MT2 // Type of the right-hand side dense matrix
9947 , bool SF // Symmetry flag
9948 , bool HF // Hermitian flag
9949 , bool LF > // Lower flag
9950inline decltype(auto) decluniupp( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
9951{
9953
9954 if( !isSquare( dm ) ) {
9955 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9956 }
9957
9958 return decluniupp( declupp( *dm ) );
9959}
9961//*************************************************************************************************
9962
9963
9964//*************************************************************************************************
9987template< typename MT1 // Type of the left-hand side dense matrix
9988 , typename MT2 // Type of the right-hand side dense matrix
9989 , bool SF // Symmetry flag
9990 , bool HF // Hermitian flag
9991 , bool LF > // Lower flag
9992inline decltype(auto) declstrupp( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
9993{
9995
9996 if( !isSquare( dm ) ) {
9997 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9998 }
9999
10000 return declstrlow( declupp( *dm ) );
10001}
10003//*************************************************************************************************
10004
10005
10006//*************************************************************************************************
10029template< typename MT1 // Type of the left-hand side dense matrix
10030 , typename MT2 // Type of the right-hand side dense matrix
10031 , bool SF // Symmetry flag
10032 , bool HF // Hermitian flag
10033 , bool LF // Lower flag
10034 , bool UF > // Upper flag
10035inline decltype(auto) decldiag( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
10036{
10038
10039 if( !isSquare( dm ) ) {
10040 BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
10041 }
10042
10043 using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
10044 return ReturnType( dm.leftOperand(), dm.rightOperand() );
10045}
10047//*************************************************************************************************
10048
10049
10050
10051
10052//=================================================================================================
10053//
10054// SIZE SPECIALIZATIONS
10055//
10056//=================================================================================================
10057
10058//*************************************************************************************************
10060template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10061struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
10062 : public Size<MT1,0UL>
10063{};
10064
10065template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10066struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
10067 : public Size<MT2,1UL>
10068{};
10070//*************************************************************************************************
10071
10072
10073
10074
10075//=================================================================================================
10076//
10077// ISALIGNED SPECIALIZATIONS
10078//
10079//=================================================================================================
10080
10081//*************************************************************************************************
10083template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10084struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10085 : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
10086{};
10088//*************************************************************************************************
10089
10090} // namespace blaze
10091
10092#endif
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.
Definition: Aliases.h:310
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for kernel specific block sizes.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Header file for the complex data type.
Header file for the conjugate shim.
Header file for the decldiag trait.
Header file for the DeclDiag functor.
Header file for the declherm trait.
Header file for the DeclHerm functor.
Header file for the decllow trait.
Header file for the DeclLow functor.
Header file for the declsym trait.
Header file for the DeclSym functor.
Header file for the declupp trait.
Header file for the DeclUpp functor.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsColumnMajorMatrix type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsIntegral type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsResizable type trait.
Header file for the IsRowMajorMatrix type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Header file for the dense matrix multiplication kernels.
Header file for the multiplication trait.
Header file for the Noop functor.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Data type constraint.
Constraint on the data type.
Constraint on the data type.
Expression object for dense matrix-dense matrix multiplications.
Definition: DMatDMatMultExpr.h:154
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:346
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:291
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatDMatMultExpr.h:309
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatDMatMultExpr.h:322
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:331
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:161
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:497
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:304
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:453
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:162
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:485
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:395
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatDMatMultExpr.h:179
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatDMatMultExpr.h:176
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:288
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatDMatMultExpr.h:177
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:290
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:172
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:298
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:295
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:157
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:167
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:287
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatDMatMultExpr.h:178
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:292
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:441
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:411
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:421
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatDMatMultExpr.h:316
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:285
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:301
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:475
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:465
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:431
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:498
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:289
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:592
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:548
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:170
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:108
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:602
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:167
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:176
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:159
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:570
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:538
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
MatScalarMultExpr< DenseMatrix< This, SO > > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:162
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:179
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:173
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:611
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:427
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:558
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:437
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:446
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:582
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:528
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:459
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:166
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:610
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:432
Base class for dense matrices.
Definition: DenseMatrix.h:82
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseMatrix base class.
Header file for the DenseVector base class.
Header file for the MatMatMultExpr base class.
Header file for the MatScalarMultExpr base class.
Header file for the SparseVector base class.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) transIf(const DenseMatrix< MT, SO > &dm)
Conditional calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:832
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) declstrupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly upper.
Definition: DMatDeclStrUppExpr.h:1003
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1464
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:978
decltype(auto) declstrlow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly lower.
Definition: DMatDeclStrLowExpr.h:1003
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1004
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1004
decltype(auto) decluniupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as uniupper.
Definition: DMatDeclUniUppExpr.h:1005
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1005
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1005
decltype(auto) declunilow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as unilower.
Definition: DMatDeclUniLowExpr.h:1004
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.
Definition: StorageOrder.h:84
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatMatMultExpr.h:103
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:584
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:1383
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
typename EnableIf<!Condition, T >::Type DisableIf_t
Auxiliary type for the EnableIf class template.
Definition: EnableIf.h:175
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
constexpr decltype(auto) zero(size_t m, size_t n) noexcept
Creating a zero matrix.
Definition: ZeroMatrix.h:1356
Header file for the exception macros of the math module.
Constraints on the storage order of matrix types.
Header file for all forward declarations for expression class templates.
Header file for the Size type trait.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:127
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:61
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:126
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:61
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:126
Generic wrapper for the decllow() function.
Definition: DeclLow.h:61
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:126
Generic wrapper for the declsym() function.
Definition: DeclSym.h:61
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:126
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:61
Base class for all matrix/matrix multiplication expression templates.
Definition: MatMatMultExpr.h:71
Base template for the MultTrait class.
Definition: MultTrait.h:130
Generic wrapper for the null function.
Definition: Noop.h:62
System settings for the BLAS mode.
System settings for the debugging policy of the Blaze library.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.