Blaze 3.9
TDMatDMatMultExpr.h
Go to the documentation of this file.
1//=================================================================================================
33//=================================================================================================
34
35#ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37
38
39//*************************************************************************************************
40// Includes
41//*************************************************************************************************
42
45#include <blaze/math/Aliases.h>
70#include <blaze/math/SIMD.h>
100#include <blaze/system/BLAS.h>
107#include <blaze/util/Assert.h>
108#include <blaze/util/Complex.h>
110#include <blaze/util/EnableIf.h>
113#include <blaze/util/mpl/If.h>
114#include <blaze/util/Types.h>
122
123
124namespace blaze {
125
126//=================================================================================================
127//
128// CLASS TDMATDMATMULTEXPR
129//
130//=================================================================================================
131
132//*************************************************************************************************
139template< typename MT1 // Type of the left-hand side dense matrix
140 , typename MT2 // Type of the right-hand side dense matrix
141 , bool SF // Symmetry flag
142 , bool HF // Hermitian flag
143 , bool LF // Lower flag
144 , bool UF > // Upper flag
146 : public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
147 , private Computation
148{
149 private:
150 //**Type definitions****************************************************************************
157 //**********************************************************************************************
158
159 //**********************************************************************************************
// Compile-time switch: true when the left-hand operand type MT1 is itself a computation
// or requires an intermediate evaluation (IsComputation_v / RequiresEvaluation_v).
161 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
162 //**********************************************************************************************
163
164 //**********************************************************************************************
// Compile-time switch: true when the right-hand operand type MT2 is itself a computation
// or requires an intermediate evaluation (IsComputation_v / RequiresEvaluation_v).
166 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
167 //**********************************************************************************************
168
169 //**********************************************************************************************
// Normalized structure flags derived from the template flags SF/HF/LF/UF.
// SYM  : symmetric flag set and none of the Hermitian/lower/upper flags.
// HERM : Hermitian flag set and neither triangular flag.
// LOW  : lower flag set, or a symmetric/Hermitian result that is also flagged upper.
// UPP  : upper flag set, or a symmetric/Hermitian result that is also flagged lower.
// LOW and UPP can both be true at once; ForwardFunctor maps that combination to DeclDiag.
170 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
171 static constexpr bool HERM = ( HF && !( LF || UF ) );
172 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
173 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
174 //**********************************************************************************************
175
176 //**********************************************************************************************
178
// Helper variable template: true when at least one operand has to be evaluated first.
// The value depends only on evaluateLeft/evaluateRight; the template parameters
// T1, T2 and T3 do not enter the expression.
182 template< typename T1, typename T2, typename T3 >
183 static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
185 //**********************************************************************************************
186
187 //**********************************************************************************************
189
// Helper variable template: true when a BLAS kernel may be used for target type T1 and
// operand types T2 (left) and T3 (right). All of the following must hold:
//  - BLAS mode and BLAS matrix/matrix multiplication are enabled in the configuration,
//  - the result carries no structure flag (none of SYM/HERM/LOW/UPP),
//  - all three types are contiguous; T1 offers mutable, T2 and T3 const data access,
//  - neither operand is diagonal,
//  - all three types are SIMD enabled,
//  - all three element types are BLAS compatible and identical.
192 template< typename T1, typename T2, typename T3 >
193 static constexpr bool UseBlasKernel_v =
194 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
195 !SYM && !HERM && !LOW && !UPP &&
196 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
197 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
198 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
199 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
200 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
201 IsBLASCompatible_v< ElementType_t<T1> > &&
202 IsBLASCompatible_v< ElementType_t<T2> > &&
203 IsBLASCompatible_v< ElementType_t<T3> > &&
204 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
205 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
207 //**********************************************************************************************
208
209 //**********************************************************************************************
211
214 template< typename T1, typename T2, typename T3 >
215 static constexpr bool UseVectorizedDefaultKernel_v =
216 ( useOptimizedKernels &&
217 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
218 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
219 IsSIMDCombinable_v< ElementType_t<T1>
221 , ElementType_t<T3> > &&
222 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
223 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
225 //**********************************************************************************************
226
227 //**********************************************************************************************
229
// Functor type selected from the normalized structure flags, checked in this order:
//   HERM        -> DeclHerm
//   SYM         -> DeclSym
//   LOW && UPP  -> DeclDiag
//   LOW only    -> DeclLow
//   UPP only    -> DeclUpp
//   none        -> Noop
232 using ForwardFunctor = If_t< HERM
233 , DeclHerm
234 , If_t< SYM
235 , DeclSym
236 , If_t< LOW
237 , If_t< UPP
238 , DeclDiag
239 , DeclLow >
240 , If_t< UPP
241 , DeclUpp
242 , Noop > > > >;
244 //**********************************************************************************************
245
246 public:
247 //**Type definitions****************************************************************************
250
253
// Result type of the product, selected through nested If_t on the normalized structure
// flags (HERM, SYM, LOW, UPP); the unstructured fall-through is MultTrait<RT1,RT2>, and
// the nested ::Type of the selected trait is taken.
// NOTE(review): as shown, every If_t lacks its "then" alternative and the embedded listing
// numbers skip 256, 258, 261-262 and 264 -- these lines appear to have been lost when the
// listing was extracted. The alias is not well-formed in this state; restore the missing
// alternatives from the upstream header before compiling.
255 using ResultType = typename If_t< HERM
257 , If_t< SYM
259 , If_t< LOW
260 , If_t< UPP
263 , If_t< UPP
265 , MultTrait<RT1,RT2> > > > >::Type;
266
// Type returned by operator() and at(): a const element value.
271 using ReturnType = const ElementType;
// Alias for the const-qualified result type.
272 using CompositeType = const ResultType;
273
// Storage type of the left operand: held by value if MT1 is itself an expression,
// otherwise held by const reference.
275 using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
276
// Storage type of the right operand: held by value if MT2 is itself an expression,
// otherwise held by const reference.
278 using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
279
282
285 //**********************************************************************************************
286
287 //**Compilation flags***************************************************************************
// SIMD flag of the expression: set unless both operands are diagonal, provided both
// operand types are SIMD enabled and SIMD addition/multiplication exist for ET1 and ET2.
289 static constexpr bool simdEnabled =
290 ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
291 MT1::simdEnabled && MT2::simdEnabled &&
292 HasSIMDAdd_v<ET1,ET2> &&
293 HasSIMDMult_v<ET1,ET2> );
294
// SMP flag of the expression: set only if neither operand needs an intermediate
// evaluation and both operand types are themselves SMP assignable.
296 static constexpr bool smpAssignable =
297 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
298 //**********************************************************************************************
299
300 //**SIMD properties*****************************************************************************
// SIMD width for ElementType as reported by SIMDTrait<ElementType>::size; the kernels
// use it as the column step of their unrolled loops.
302 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
303 //**********************************************************************************************
304
305 //**Constructor*********************************************************************************
311 inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
312 : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
313 , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
314 {
315 BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
316 }
317 //**********************************************************************************************
318
319 //**Access operator*****************************************************************************
326 inline ReturnType operator()( size_t i, size_t j ) const {
327 BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
328 BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
329
330 if( IsDiagonal_v<MT1> ) {
331 return lhs_(i,i) * rhs_(i,j);
332 }
333 else if( IsDiagonal_v<MT2> ) {
334 return lhs_(i,j) * rhs_(j,j);
335 }
336 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
337 const size_t begin( ( IsUpper_v<MT1> )
338 ?( ( IsLower_v<MT2> )
339 ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
340 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
341 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
342 :( ( IsLower_v<MT2> )
343 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
344 :( 0UL ) ) );
345 const size_t end( ( IsLower_v<MT1> )
346 ?( ( IsUpper_v<MT2> )
347 ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
348 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
349 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
350 :( ( IsUpper_v<MT2> )
351 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
352 :( lhs_.columns() ) ) );
353
354 if( begin >= end ) return ElementType();
355
356 const size_t n( end - begin );
357
358 return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
360 }
361 else {
362 return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
363 }
364 }
365 //**********************************************************************************************
366
367 //**At function*********************************************************************************
375 inline ReturnType at( size_t i, size_t j ) const {
376 if( i >= lhs_.rows() ) {
377 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
378 }
379 if( j >= rhs_.columns() ) {
380 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
381 }
382 return (*this)(i,j);
383 }
384 //**********************************************************************************************
385
386 //**Rows function*******************************************************************************
391 inline size_t rows() const noexcept {
392 return lhs_.rows();
393 }
394 //**********************************************************************************************
395
396 //**Columns function****************************************************************************
401 inline size_t columns() const noexcept {
402 return rhs_.columns();
403 }
404 //**********************************************************************************************
405
406 //**Left operand access*************************************************************************
411 inline LeftOperand leftOperand() const noexcept {
412 return lhs_;
413 }
414 //**********************************************************************************************
415
416 //**Right operand access************************************************************************
421 inline RightOperand rightOperand() const noexcept {
422 return rhs_;
423 }
424 //**********************************************************************************************
425
426 //**********************************************************************************************
432 template< typename T >
433 inline bool canAlias( const T* alias ) const noexcept {
434 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
435 }
436 //**********************************************************************************************
437
438 //**********************************************************************************************
444 template< typename T >
445 inline bool isAliased( const T* alias ) const noexcept {
446 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
447 }
448 //**********************************************************************************************
449
450 //**********************************************************************************************
455 inline bool isAligned() const noexcept {
456 return lhs_.isAligned() && rhs_.isAligned();
457 }
458 //**********************************************************************************************
459
460 //**********************************************************************************************
465 inline bool canSMPAssign() const noexcept {
466 return ( !BLAZE_BLAS_MODE ||
467 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
469 ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
470 ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
471 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
472 }
473 //**********************************************************************************************
474
475 private:
476 //**Member variables****************************************************************************
479 //**********************************************************************************************
480
481 //**Assignment to dense matrices****************************************************************
494 template< typename MT // Type of the target dense matrix
495 , bool SO > // Storage order of the target dense matrix
496 friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
497 {
499
500 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
501 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
502
503 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
504 return;
505 }
506 else if( rhs.lhs_.columns() == 0UL ) {
507 reset( *lhs );
508 return;
509 }
510
511 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
512 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
513
514 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
515 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
516 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
517 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
518 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
519 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
520
521 TDMatDMatMultExpr::selectAssignKernel( *lhs, A, B );
522 }
524 //**********************************************************************************************
525
526 //**Assignment to dense matrices (kernel selection)*********************************************
537 template< typename MT3 // Type of the left-hand side target matrix
538 , typename MT4 // Type of the left-hand side matrix operand
539 , typename MT5 > // Type of the right-hand side matrix operand
540 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
541 {
542 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
543 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
544 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
545 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
546 selectSmallAssignKernel( C, A, B );
547 else
548 selectBlasAssignKernel( C, A, B );
549 }
551 //**********************************************************************************************
552
553 //**Default assignment to row-major dense matrices (general/general)****************************
// Default (scalar) kernel for C = A*B with a row-major target and two non-diagonal operands.
//   C : target matrix, written row by row
//   A : left-hand side operand
//   B : right-hand side operand
// For each row i the first contributing inner index (kbegin) initializes C(i,*) by
// assignment, the remaining inner indices accumulate into it. Triangular structure of
// A and B and the structure flags SYM/HERM/LOW/UPP narrow the index ranges.
567 template< typename MT3 // Type of the left-hand side target matrix
568 , typename MT4 // Type of the left-hand side matrix operand
569 , typename MT5 > // Type of the right-hand side matrix operand
570 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
571 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
572 {
573 const size_t M( A.rows() );
574 const size_t N( B.columns() );
575 const size_t K( A.columns() );
576
// A structured result is only meaningful for a square product
577 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
578
579 for( size_t i=0UL; i<M; ++i )
580 {
// [kbegin,kend): inner indices k for which A(i,k) is not excluded by A's
// (strictly) upper/lower structure
581 const size_t kbegin( ( IsUpper_v<MT4> )
582 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
583 :( 0UL ) );
584 const size_t kend( ( IsLower_v<MT4> )
585 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
586 :( K ) );
587 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
588
// Row i of a strictly triangular A has no usable inner index: clear row i of C
589 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
590 for( size_t j=0UL; j<N; ++j ) {
591 reset( C(i,j) );
592 }
593 continue;
594 }
595
// First inner index (k == kbegin): initialize row i of C by assignment
596 {
// [jbegin,jend): columns written for k == kbegin, limited by B's structure in
// row kbegin and, for UPP/LOW results, to the upper/lower part of row i
597 const size_t jbegin( ( IsUpper_v<MT5> )
598 ?( ( IsStrictlyUpper_v<MT5> )
599 ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
600 :( UPP ? max(i,kbegin) : kbegin ) )
601 :( UPP ? i : 0UL ) );
602 const size_t jend( ( IsLower_v<MT5> )
603 ?( ( IsStrictlyLower_v<MT5> )
604 ?( LOW ? min(i+1UL,kbegin) : kbegin )
605 :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
606 :( LOW ? i+1UL : N ) );
607
// Clear the columns left of the written range where the structure makes them zero
608 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
609 for( size_t j=0UL; j<jbegin; ++j ) {
610 reset( C(i,j) );
611 }
612 }
613 else if( IsStrictlyUpper_v<MT5> ) {
614 reset( C(i,0UL) );
615 }
616 for( size_t j=jbegin; j<jend; ++j ) {
617 C(i,j) = A(i,kbegin) * B(kbegin,j);
618 }
// Clear the columns right of the written range where the structure makes them zero
619 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
620 for( size_t j=jend; j<N; ++j ) {
621 reset( C(i,j) );
622 }
623 }
624 else if( IsStrictlyLower_v<MT5> ) {
625 reset( C(i,N-1UL) );
626 }
627 }
628
// Remaining inner indices: accumulate into row i of C
629 for( size_t k=kbegin+1UL; k<kend; ++k )
630 {
// Column range for this k; for SYM/HERM/UPP results only columns j >= i are computed
631 const size_t jbegin( ( IsUpper_v<MT5> )
632 ?( ( IsStrictlyUpper_v<MT5> )
633 ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
634 :( SYM || HERM || UPP ? max( i, k ) : k ) )
635 :( SYM || HERM || UPP ? i : 0UL ) );
636 const size_t jend( ( IsLower_v<MT5> )
637 ?( ( IsStrictlyLower_v<MT5> )
638 ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
639 :( LOW ? min(i+1UL,k) : k ) )
640 :( LOW ? i+1UL : N ) );
641
// With a structured result the clipped range may be empty; skip this k then
642 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
643 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
644
645 for( size_t j=jbegin; j<jend; ++j ) {
646 C(i,j) += A(i,k) * B(k,j);
647 }
// For a lower B, column jend is assigned rather than accumulated
// (presumably because this k is the first one contributing to it -- confirm)
648 if( IsLower_v<MT5> ) {
649 C(i,jend) = A(i,k) * B(k,jend);
650 }
651 }
652 }
653
// SYM/HERM: only the upper part was computed; fill the strictly lower part by
// mirroring (conjugated for HERM)
654 if( SYM || HERM ) {
655 for( size_t i=1UL; i<M; ++i ) {
656 for( size_t j=0UL; j<i; ++j ) {
657 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
658 }
659 }
660 }
661 }
663 //**********************************************************************************************
664
665 //**Default assignment to column-major dense matrices (general/general)*************************
// Default (scalar) kernel for C = A*B with a column-major target and two non-diagonal
// operands.
//   C : target matrix, written column by column
//   A : left-hand side operand
//   B : right-hand side operand
// For each column j the first contributing inner index (kbegin) initializes C(*,j) by
// assignment, the remaining inner indices accumulate into it. Triangular structure of
// A and B and the structure flags SYM/HERM/LOW/UPP narrow the index ranges.
679 template< typename MT3 // Type of the left-hand side target matrix
680 , typename MT4 // Type of the left-hand side matrix operand
681 , typename MT5 > // Type of the right-hand side matrix operand
682 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
683 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
684 {
685 const size_t M( A.rows() );
686 const size_t N( B.columns() );
687 const size_t K( A.columns() );
688
// A structured result is only meaningful for a square product
689 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
690
691 for( size_t j=0UL; j<N; ++j )
692 {
// [kbegin,kend): inner indices k for which B(k,j) is not excluded by B's
// (strictly) lower/upper structure
693 const size_t kbegin( ( IsLower_v<MT5> )
694 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
695 :( 0UL ) );
696 const size_t kend( ( IsUpper_v<MT5> )
697 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
698 :( K ) );
699 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
700
// Column j of a strictly triangular B has no usable inner index: clear column j of C
701 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
702 for( size_t i=0UL; i<M; ++i ) {
703 reset( C(i,j) );
704 }
705 continue;
706 }
707
// First inner index (k == kbegin): initialize column j of C by assignment
708 {
// [ibegin,iend): rows written for k == kbegin, limited by A's structure in
// column kbegin and, for LOW/UPP results, to the lower/upper part of column j
709 const size_t ibegin( ( IsLower_v<MT4> )
710 ?( ( IsStrictlyLower_v<MT4> )
711 ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
712 :( LOW ? max(j,kbegin) : kbegin ) )
713 :( LOW ? j : 0UL ) );
714 const size_t iend( ( IsUpper_v<MT4> )
715 ?( ( IsStrictlyUpper_v<MT4> )
716 ?( UPP ? min(j+1UL,kbegin) : kbegin )
717 :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
718 :( UPP ? j+1UL : M ) );
719
// Clear the rows above the written range where the structure makes them zero
720 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
721 for( size_t i=0UL; i<ibegin; ++i ) {
722 reset( C(i,j) );
723 }
724 }
725 else if( IsStrictlyLower_v<MT4> ) {
726 reset( C(0UL,j) );
727 }
728 for( size_t i=ibegin; i<iend; ++i ) {
729 C(i,j) = A(i,kbegin) * B(kbegin,j);
730 }
// Clear the rows below the written range where the structure makes them zero
731 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
732 for( size_t i=iend; i<M; ++i ) {
733 reset( C(i,j) );
734 }
735 }
736 else if( IsStrictlyUpper_v<MT4> ) {
737 reset( C(M-1UL,j) );
738 }
739 }
740
// Remaining inner indices: accumulate into column j of C
741 for( size_t k=kbegin+1UL; k<kend; ++k )
742 {
// Row range for this k; for SYM/HERM/LOW results only rows i >= j are computed
743 const size_t ibegin( ( IsLower_v<MT4> )
744 ?( ( IsStrictlyLower_v<MT4> )
745 ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
746 :( SYM || HERM || LOW ? max( j, k ) : k ) )
747 :( SYM || HERM || LOW ? j : 0UL ) );
748 const size_t iend( ( IsUpper_v<MT4> )
749 ?( ( IsStrictlyUpper_v<MT4> )
750 ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
751 :( UPP ? min(j+1UL,k) : k ) )
752 :( UPP ? j+1UL : M ) );
753
// With a structured result the clipped range may be empty; skip this k then
754 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
755 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
756
757 for( size_t i=ibegin; i<iend; ++i ) {
758 C(i,j) += A(i,k) * B(k,j);
759 }
// For an upper A, row iend is assigned rather than accumulated
// (presumably because this k is the first one contributing to it -- confirm)
760 if( IsUpper_v<MT4> ) {
761 C(iend,j) = A(iend,k) * B(k,j);
762 }
763 }
764 }
765
// SYM/HERM: only the lower part was computed; fill the strictly upper part by
// mirroring (conjugated for HERM)
766 if( SYM || HERM ) {
767 for( size_t j=1UL; j<N; ++j ) {
768 for( size_t i=0UL; i<j; ++i ) {
769 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
770 }
771 }
772 }
773 }
775 //**********************************************************************************************
776
777 //**Default assignment to row-major dense matrices (general/diagonal)***************************
// Default kernel for C = A*B with a row-major target, a non-diagonal left operand and a
// diagonal right operand.
//   C : target matrix
//   A : left-hand side operand
//   B : diagonal right-hand side operand
// Since only B(j,j) is used, every element reduces to C(i,j) = A(i,j) * B(j,j).
// The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements.
791 template< typename MT3 // Type of the left-hand side target matrix
792 , typename MT4 // Type of the left-hand side matrix operand
793 , typename MT5 > // Type of the right-hand side matrix operand
794 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
795 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
796 {
797 constexpr size_t block( BLOCK_SIZE );
798
799 const size_t M( A.rows() );
800 const size_t N( B.columns() );
801
// Tile loops: rows [ii,iend) x columns [jj,jend)
802 for( size_t ii=0UL; ii<M; ii+=block ) {
803 const size_t iend( min( M, ii+block ) );
804 for( size_t jj=0UL; jj<N; jj+=block ) {
805 const size_t jend( min( N, jj+block ) );
806 for( size_t i=ii; i<iend; ++i )
807 {
// [jbegin,jpos): columns of row i inside this tile that A's (strictly)
// upper/lower structure does not exclude
808 const size_t jbegin( ( IsUpper_v<MT4> )
809 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
810 :( jj ) );
811 const size_t jpos( ( IsLower_v<MT4> )
812 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
813 :( jend ) );
814
// Upper A: tile columns before jbegin are cleared
815 if( IsUpper_v<MT4> ) {
816 for( size_t j=jj; j<jbegin; ++j ) {
817 reset( C(i,j) );
818 }
819 }
820 for( size_t j=jbegin; j<jpos; ++j ) {
821 C(i,j) = A(i,j) * B(j,j);
822 }
// Lower A: tile columns from jpos on are cleared
823 if( IsLower_v<MT4> ) {
824 for( size_t j=jpos; j<jend; ++j ) {
825 reset( C(i,j) );
826 }
827 }
828 }
829 }
830 }
831 }
833 //**********************************************************************************************
834
835 //**Default assignment to column-major dense matrices (general/diagonal)************************
849 template< typename MT3 // Type of the left-hand side target matrix
850 , typename MT4 // Type of the left-hand side matrix operand
851 , typename MT5 > // Type of the right-hand side matrix operand
852 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
853 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
854 {
855 const size_t M( A.rows() );
856 const size_t N( B.columns() );
857
858 for( size_t j=0UL; j<N; ++j )
859 {
860 const size_t ibegin( ( IsLower_v<MT4> )
861 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
862 :( 0UL ) );
863 const size_t iend( ( IsUpper_v<MT4> )
864 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
865 :( M ) );
866 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
867
868 if( IsLower_v<MT4> ) {
869 for( size_t i=0UL; i<ibegin; ++i ) {
870 reset( C(i,j) );
871 }
872 }
873 for( size_t i=ibegin; i<iend; ++i ) {
874 C(i,j) = A(i,j) * B(j,j);
875 }
876 if( IsUpper_v<MT4> ) {
877 for( size_t i=iend; i<M; ++i ) {
878 reset( C(i,j) );
879 }
880 }
881 }
882 }
884 //**********************************************************************************************
885
886 //**Default assignment to row-major dense matrices (diagonal/general)***************************
900 template< typename MT3 // Type of the left-hand side target matrix
901 , typename MT4 // Type of the left-hand side matrix operand
902 , typename MT5 > // Type of the right-hand side matrix operand
903 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
904 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
905 {
906 const size_t M( A.rows() );
907 const size_t N( B.columns() );
908
909 for( size_t i=0UL; i<M; ++i )
910 {
911 const size_t jbegin( ( IsUpper_v<MT5> )
912 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
913 :( 0UL ) );
914 const size_t jend( ( IsLower_v<MT5> )
915 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
916 :( N ) );
917 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
918
919 if( IsUpper_v<MT5> ) {
920 for( size_t j=0UL; j<jbegin; ++j ) {
921 reset( C(i,j) );
922 }
923 }
924 for( size_t j=jbegin; j<jend; ++j ) {
925 C(i,j) = A(i,i) * B(i,j);
926 }
927 if( IsLower_v<MT5> ) {
928 for( size_t j=jend; j<N; ++j ) {
929 reset( C(i,j) );
930 }
931 }
932 }
933 }
935 //**********************************************************************************************
936
937 //**Default assignment to column-major dense matrices (diagonal/general)************************
// Default kernel for C = A*B with a column-major target, a diagonal left operand and a
// non-diagonal right operand.
//   C : target matrix
//   A : diagonal left-hand side operand
//   B : right-hand side operand
// Since only A(i,i) is used, every element reduces to C(i,j) = A(i,i) * B(i,j).
// The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements.
951 template< typename MT3 // Type of the left-hand side target matrix
952 , typename MT4 // Type of the left-hand side matrix operand
953 , typename MT5 > // Type of the right-hand side matrix operand
954 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
955 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
956 {
957 constexpr size_t block( BLOCK_SIZE );
958
959 const size_t M( A.rows() );
960 const size_t N( B.columns() );
961
// Tile loops: columns [jj,jend) x rows [ii,iend)
962 for( size_t jj=0UL; jj<N; jj+=block ) {
963 const size_t jend( min( N, jj+block ) );
964 for( size_t ii=0UL; ii<M; ii+=block ) {
965 const size_t iend( min( M, ii+block ) );
966 for( size_t j=jj; j<jend; ++j )
967 {
// [ibegin,ipos): rows of column j inside this tile that B's (strictly)
// lower/upper structure does not exclude
968 const size_t ibegin( ( IsLower_v<MT5> )
969 ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
970 :( ii ) );
971 const size_t ipos( ( IsUpper_v<MT5> )
972 ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
973 :( iend ) );
974
// Lower B: tile rows before ibegin are cleared
975 if( IsLower_v<MT5> ) {
976 for( size_t i=ii; i<ibegin; ++i ) {
977 reset( C(i,j) );
978 }
979 }
980 for( size_t i=ibegin; i<ipos; ++i ) {
981 C(i,j) = A(i,i) * B(i,j);
982 }
// Upper B: tile rows from ipos on are cleared
983 if( IsUpper_v<MT5> ) {
984 for( size_t i=ipos; i<iend; ++i ) {
985 reset( C(i,j) );
986 }
987 }
988 }
989 }
990 }
991 }
993 //**********************************************************************************************
994
995 //**Default assignment to dense matrices (diagonal/diagonal)************************************
1009 template< typename MT3 // Type of the left-hand side target matrix
1010 , typename MT4 // Type of the left-hand side matrix operand
1011 , typename MT5 > // Type of the right-hand side matrix operand
1012 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1013 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1014 {
1015 reset( C );
1016
1017 for( size_t i=0UL; i<A.rows(); ++i ) {
1018 C(i,i) = A(i,i) * B(i,i);
1019 }
1020 }
1022 //**********************************************************************************************
1023
1024 //**Default assignment to dense matrices (small matrices)***************************************
1038 template< typename MT3 // Type of the left-hand side target matrix
1039 , typename MT4 // Type of the left-hand side matrix operand
1040 , typename MT5 > // Type of the right-hand side matrix operand
1041 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1042 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1043 {
1044 selectDefaultAssignKernel( C, A, B );
1045 }
1047 //**********************************************************************************************
1048
1049 //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1064 template< typename MT3 // Type of the left-hand side target matrix
1065 , typename MT4 // Type of the left-hand side matrix operand
1066 , typename MT5 > // Type of the right-hand side matrix operand
1067 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1068 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1069 {
1070 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
1071
1072 const size_t M( A.rows() );
1073 const size_t N( B.columns() );
1074 const size_t K( A.columns() );
1075
1076 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1077
1078 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
1079 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
1080
1081 size_t j( 0UL );
1082
1083 if( IsIntegral_v<ElementType> )
1084 {
1085 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1086 for( size_t i=0UL; i<M; ++i )
1087 {
1088 const size_t kbegin( ( IsUpper_v<MT4> )
1089 ?( ( IsLower_v<MT5> )
1090 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1091 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1092 :( IsLower_v<MT5> ? j : 0UL ) );
1093 const size_t kend( ( IsLower_v<MT4> )
1094 ?( ( IsUpper_v<MT5> )
1095 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1096 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1097 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
1098
1099 size_t k( kbegin );
1100
1101 if( k < kend )
1102 {
1103 SIMDType a1( set( A(i,k) ) );
1104 SIMDType xmm1( a1 * B.load(k,j ) );
1105 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
1106 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
1107 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
1108 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
1109 SIMDType xmm6( a1 * B.load(k,j+SIMDSIZE*5UL) );
1110 SIMDType xmm7( a1 * B.load(k,j+SIMDSIZE*6UL) );
1111 SIMDType xmm8( a1 * B.load(k,j+SIMDSIZE*7UL) );
1112
1113 for( ++k; k<kend; ++k ) {
1114 a1 = set( A(i,k) );
1115 xmm1 += a1 * B.load(k,j );
1116 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1117 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1118 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1119 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1120 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1121 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1122 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1123 }
1124
1125 C.store( i, j , xmm1 );
1126 C.store( i, j+SIMDSIZE , xmm2 );
1127 C.store( i, j+SIMDSIZE*2UL, xmm3 );
1128 C.store( i, j+SIMDSIZE*3UL, xmm4 );
1129 C.store( i, j+SIMDSIZE*4UL, xmm5 );
1130 C.store( i, j+SIMDSIZE*5UL, xmm6 );
1131 C.store( i, j+SIMDSIZE*6UL, xmm7 );
1132 C.store( i, j+SIMDSIZE*7UL, xmm8 );
1133 }
1134 else
1135 {
1136 const SIMDType zero;
1137 C.store( i, j , zero );
1138 C.store( i, j+SIMDSIZE , zero );
1139 C.store( i, j+SIMDSIZE*2UL, zero );
1140 C.store( i, j+SIMDSIZE*3UL, zero );
1141 C.store( i, j+SIMDSIZE*4UL, zero );
1142 C.store( i, j+SIMDSIZE*5UL, zero );
1143 C.store( i, j+SIMDSIZE*6UL, zero );
1144 C.store( i, j+SIMDSIZE*7UL, zero );
1145 }
1146 }
1147 }
1148 }
1149
1150 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1151 {
1152 size_t i( 0UL );
1153
1154 for( ; (i+2UL) <= M; i+=2UL )
1155 {
1156 const size_t kbegin( ( IsUpper_v<MT4> )
1157 ?( ( IsLower_v<MT5> )
1158 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1159 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1160 :( IsLower_v<MT5> ? j : 0UL ) );
1161 const size_t kend( ( IsLower_v<MT4> )
1162 ?( ( IsUpper_v<MT5> )
1163 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
1164 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1165 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
1166
1167 size_t k( kbegin );
1168
1169 if( k < kend )
1170 {
1171 SIMDType a1( set( A(i ,k) ) );
1172 SIMDType a2( set( A(i+1UL,k) ) );
1173 SIMDType b1( B.load(k,j ) );
1174 SIMDType b2( B.load(k,j+SIMDSIZE ) );
1175 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1176 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1177 SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1178 SIMDType xmm1 ( a1 * b1 );
1179 SIMDType xmm2 ( a1 * b2 );
1180 SIMDType xmm3 ( a1 * b3 );
1181 SIMDType xmm4 ( a1 * b4 );
1182 SIMDType xmm5 ( a1 * b5 );
1183 SIMDType xmm6 ( a2 * b1 );
1184 SIMDType xmm7 ( a2 * b2 );
1185 SIMDType xmm8 ( a2 * b3 );
1186 SIMDType xmm9 ( a2 * b4 );
1187 SIMDType xmm10( a2 * b5 );
1188
1189 for( ++k; k<kend; ++k ) {
1190 a1 = set( A(i ,k) );
1191 a2 = set( A(i+1UL,k) );
1192 b1 = B.load(k,j );
1193 b2 = B.load(k,j+SIMDSIZE );
1194 b3 = B.load(k,j+SIMDSIZE*2UL);
1195 b4 = B.load(k,j+SIMDSIZE*3UL);
1196 b5 = B.load(k,j+SIMDSIZE*4UL);
1197 xmm1 += a1 * b1;
1198 xmm2 += a1 * b2;
1199 xmm3 += a1 * b3;
1200 xmm4 += a1 * b4;
1201 xmm5 += a1 * b5;
1202 xmm6 += a2 * b1;
1203 xmm7 += a2 * b2;
1204 xmm8 += a2 * b3;
1205 xmm9 += a2 * b4;
1206 xmm10 += a2 * b5;
1207 }
1208
1209 C.store( i , j , xmm1 );
1210 C.store( i , j+SIMDSIZE , xmm2 );
1211 C.store( i , j+SIMDSIZE*2UL, xmm3 );
1212 C.store( i , j+SIMDSIZE*3UL, xmm4 );
1213 C.store( i , j+SIMDSIZE*4UL, xmm5 );
1214 C.store( i+1UL, j , xmm6 );
1215 C.store( i+1UL, j+SIMDSIZE , xmm7 );
1216 C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1217 C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1218 C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1219 }
1220 else
1221 {
1222 const SIMDType zero;
1223 C.store( i , j , zero );
1224 C.store( i , j+SIMDSIZE , zero );
1225 C.store( i , j+SIMDSIZE*2UL, zero );
1226 C.store( i , j+SIMDSIZE*3UL, zero );
1227 C.store( i , j+SIMDSIZE*4UL, zero );
1228 C.store( i+1UL, j , zero );
1229 C.store( i+1UL, j+SIMDSIZE , zero );
1230 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
1231 C.store( i+1UL, j+SIMDSIZE*3UL, zero );
1232 C.store( i+1UL, j+SIMDSIZE*4UL, zero );
1233 }
1234 }
1235
1236 if( i < M )
1237 {
1238 const size_t kbegin( ( IsUpper_v<MT4> )
1239 ?( ( IsLower_v<MT5> )
1240 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1241 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1242 :( IsLower_v<MT5> ? j : 0UL ) );
1243 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
1244
1245 size_t k( kbegin );
1246
1247 if( k < kend )
1248 {
1249 SIMDType a1( set( A(i,k) ) );
1250 SIMDType xmm1( a1 * B.load(k,j ) );
1251 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
1252 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
1253 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
1254 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
1255
1256 for( ++k; k<kend; ++k ) {
1257 a1 = set( A(i,k) );
1258 xmm1 += a1 * B.load(k,j );
1259 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1260 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1261 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1262 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1263 }
1264
1265 C.store( i, j , xmm1 );
1266 C.store( i, j+SIMDSIZE , xmm2 );
1267 C.store( i, j+SIMDSIZE*2UL, xmm3 );
1268 C.store( i, j+SIMDSIZE*3UL, xmm4 );
1269 C.store( i, j+SIMDSIZE*4UL, xmm5 );
1270 }
1271 else
1272 {
1273 const SIMDType zero;
1274 C.store( i, j , zero );
1275 C.store( i, j+SIMDSIZE , zero );
1276 C.store( i, j+SIMDSIZE*2UL, zero );
1277 C.store( i, j+SIMDSIZE*3UL, zero );
1278 C.store( i, j+SIMDSIZE*4UL, zero );
1279 }
1280 }
1281 }
1282
1283 for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1284 {
1285 const size_t iend( UPP ? min(j+SIMDSIZE*4UL,M) : M );
1286 size_t i( 0UL );
1287
1288 if( SYM || HERM ) {
1289 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1290 for( ; i<j; ++i ) {
1291 for( size_t jj=j; jj<jjend; ++jj ) {
1292 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1293 }
1294 }
1295 }
1296 else if( LOW ) {
1297 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1298 for( ; i<j; ++i ) {
1299 for( size_t jj=j; jj<jjend; ++jj ) {
1300 reset( C(i,jj) );
1301 }
1302 }
1303 }
1304
1305 for( ; (i+2UL) <= iend; i+=2UL )
1306 {
1307 const size_t kbegin( ( IsUpper_v<MT4> )
1308 ?( ( IsLower_v<MT5> )
1309 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1310 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1311 :( IsLower_v<MT5> ? j : 0UL ) );
1312 const size_t kend( ( IsLower_v<MT4> )
1313 ?( ( IsUpper_v<MT5> )
1314 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1315 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1316 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
1317
1318 size_t k( kbegin );
1319
1320 if( k < kend )
1321 {
1322 SIMDType a1( set( A(i ,k) ) );
1323 SIMDType a2( set( A(i+1UL,k) ) );
1324 SIMDType b1( B.load(k,j ) );
1325 SIMDType b2( B.load(k,j+SIMDSIZE ) );
1326 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1327 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1328 SIMDType xmm1( a1 * b1 );
1329 SIMDType xmm2( a1 * b2 );
1330 SIMDType xmm3( a1 * b3 );
1331 SIMDType xmm4( a1 * b4 );
1332 SIMDType xmm5( a2 * b1 );
1333 SIMDType xmm6( a2 * b2 );
1334 SIMDType xmm7( a2 * b3 );
1335 SIMDType xmm8( a2 * b4 );
1336
1337 for( ++k; k<kend; ++k ) {
1338 a1 = set( A(i ,k) );
1339 a2 = set( A(i+1UL,k) );
1340 b1 = B.load(k,j );
1341 b2 = B.load(k,j+SIMDSIZE );
1342 b3 = B.load(k,j+SIMDSIZE*2UL);
1343 b4 = B.load(k,j+SIMDSIZE*3UL);
1344 xmm1 += a1 * b1;
1345 xmm2 += a1 * b2;
1346 xmm3 += a1 * b3;
1347 xmm4 += a1 * b4;
1348 xmm5 += a2 * b1;
1349 xmm6 += a2 * b2;
1350 xmm7 += a2 * b3;
1351 xmm8 += a2 * b4;
1352 }
1353
1354 C.store( i , j , xmm1 );
1355 C.store( i , j+SIMDSIZE , xmm2 );
1356 C.store( i , j+SIMDSIZE*2UL, xmm3 );
1357 C.store( i , j+SIMDSIZE*3UL, xmm4 );
1358 C.store( i+1UL, j , xmm5 );
1359 C.store( i+1UL, j+SIMDSIZE , xmm6 );
1360 C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1361 C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1362 }
1363 else
1364 {
1365 SIMDType zero;
1366 C.store( i , j , zero );
1367 C.store( i , j+SIMDSIZE , zero );
1368 C.store( i , j+SIMDSIZE*2UL, zero );
1369 C.store( i , j+SIMDSIZE*3UL, zero );
1370 C.store( i+1UL, j , zero );
1371 C.store( i+1UL, j+SIMDSIZE , zero );
1372 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
1373 C.store( i+1UL, j+SIMDSIZE*3UL, zero );
1374 }
1375 }
1376
1377 if( i < iend )
1378 {
1379 const size_t kbegin( ( IsUpper_v<MT4> )
1380 ?( ( IsLower_v<MT5> )
1381 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1382 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1383 :( IsLower_v<MT5> ? j : 0UL ) );
1384 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1385
1386 size_t k( kbegin );
1387
1388 if( k < kend )
1389 {
1390 SIMDType a1( set( A(i,k) ) );
1391 SIMDType xmm1( a1 * B.load(k,j ) );
1392 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
1393 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
1394 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
1395
1396 for( ++k; k<kend; ++k ) {
1397 a1 = set( A(i,k) );
1398 xmm1 += a1 * B.load(k,j );
1399 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1400 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1401 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1402 }
1403
1404 C.store( i, j , xmm1 );
1405 C.store( i, j+SIMDSIZE , xmm2 );
1406 C.store( i, j+SIMDSIZE*2UL, xmm3 );
1407 C.store( i, j+SIMDSIZE*3UL, xmm4 );
1408 }
1409 else
1410 {
1411 const SIMDType zero;
1412 C.store( i, j , zero );
1413 C.store( i, j+SIMDSIZE , zero );
1414 C.store( i, j+SIMDSIZE*2UL, zero );
1415 C.store( i, j+SIMDSIZE*3UL, zero );
1416 }
1417
1418 if( UPP ) ++i;
1419 }
1420
1421 if( UPP ) {
1422 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1423 for( ; i<M; ++i ) {
1424 for( size_t jj=j; jj<jjend; ++jj ) {
1425 reset( C(i,jj) );
1426 }
1427 }
1428 }
1429 }
1430
1431 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1432 {
1433 const size_t iend( UPP ? min(j+SIMDSIZE*3UL,M) : M );
1434 size_t i( 0UL );
1435
1436 if( SYM || HERM ) {
1437 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1438 for( ; i<j; ++i ) {
1439 for( size_t jj=j; jj<jjend; ++jj ) {
1440 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1441 }
1442 }
1443 }
1444 else if( LOW ) {
1445 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1446 for( ; i<j; ++i ) {
1447 for( size_t jj=j; jj<jjend; ++jj ) {
1448 reset( C(i,jj) );
1449 }
1450 }
1451 }
1452
1453 for( ; (i+2UL) <= iend; i+=2UL )
1454 {
1455 const size_t kbegin( ( IsUpper_v<MT4> )
1456 ?( ( IsLower_v<MT5> )
1457 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1458 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1459 :( IsLower_v<MT5> ? j : 0UL ) );
1460 const size_t kend( ( IsLower_v<MT4> )
1461 ?( ( IsUpper_v<MT5> )
1462 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1463 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1464 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
1465
1466 size_t k( kbegin );
1467
1468 if( k < kend )
1469 {
1470 SIMDType a1( set( A(i ,k) ) );
1471 SIMDType a2( set( A(i+1UL,k) ) );
1472 SIMDType b1( B.load(k,j ) );
1473 SIMDType b2( B.load(k,j+SIMDSIZE ) );
1474 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1475 SIMDType xmm1( a1 * b1 );
1476 SIMDType xmm2( a1 * b2 );
1477 SIMDType xmm3( a1 * b3 );
1478 SIMDType xmm4( a2 * b1 );
1479 SIMDType xmm5( a2 * b2 );
1480 SIMDType xmm6( a2 * b3 );
1481
1482 for( ++k; k<kend; ++k ) {
1483 a1 = set( A(i ,k) );
1484 a2 = set( A(i+1UL,k) );
1485 b1 = B.load(k,j );
1486 b2 = B.load(k,j+SIMDSIZE );
1487 b3 = B.load(k,j+SIMDSIZE*2UL);
1488 xmm1 += a1 * b1;
1489 xmm2 += a1 * b2;
1490 xmm3 += a1 * b3;
1491 xmm4 += a2 * b1;
1492 xmm5 += a2 * b2;
1493 xmm6 += a2 * b3;
1494 }
1495
1496 C.store( i , j , xmm1 );
1497 C.store( i , j+SIMDSIZE , xmm2 );
1498 C.store( i , j+SIMDSIZE*2UL, xmm3 );
1499 C.store( i+1UL, j , xmm4 );
1500 C.store( i+1UL, j+SIMDSIZE , xmm5 );
1501 C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1502 }
1503 else
1504 {
1505 const SIMDType zero;
1506 C.store( i , j , zero );
1507 C.store( i , j+SIMDSIZE , zero );
1508 C.store( i , j+SIMDSIZE*2UL, zero );
1509 C.store( i+1UL, j , zero );
1510 C.store( i+1UL, j+SIMDSIZE , zero );
1511 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
1512 }
1513 }
1514
1515 if( i < iend )
1516 {
1517 const size_t kbegin( ( IsUpper_v<MT4> )
1518 ?( ( IsLower_v<MT5> )
1519 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1520 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1521 :( IsLower_v<MT5> ? j : 0UL ) );
1522 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1523
1524 size_t k( kbegin );
1525
1526 if( k < kend )
1527 {
1528 SIMDType a1( set( A(i,k) ) );
1529 SIMDType xmm1( a1 * B.load(k,j ) );
1530 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
1531 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
1532
1533 for( ++k; k<kend; ++k ) {
1534 a1 = set( A(i,k) );
1535 xmm1 += a1 * B.load(k,j );
1536 xmm2 += a1 * B.load(k,j+SIMDSIZE );
1537 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1538 }
1539
1540 C.store( i, j , xmm1 );
1541 C.store( i, j+SIMDSIZE , xmm2 );
1542 C.store( i, j+SIMDSIZE*2UL, xmm3 );
1543 }
1544 else
1545 {
1546 const SIMDType zero;
1547 C.store( i, j , zero );
1548 C.store( i, j+SIMDSIZE , zero );
1549 C.store( i, j+SIMDSIZE*2UL, zero );
1550 }
1551
1552 if( UPP ) ++i;
1553 }
1554
1555 if( UPP ) {
1556 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1557 for( ; i<M; ++i ) {
1558 for( size_t jj=j; jj<jjend; ++jj ) {
1559 reset( C(i,jj) );
1560 }
1561 }
1562 }
1563 }
1564
1565 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1566 {
1567 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
1568 size_t i( 0UL );
1569
1570 if( SYM || HERM ) {
1571 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1572 for( ; i<j; ++i ) {
1573 for( size_t jj=j; jj<jjend; ++jj ) {
1574 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1575 }
1576 }
1577 }
1578 else if( LOW ) {
1579 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1580 for( ; i<j; ++i ) {
1581 for( size_t jj=j; jj<jjend; ++jj ) {
1582 reset( C(i,jj) );
1583 }
1584 }
1585 }
1586
1587 for( ; (i+4UL) <= iend; i+=4UL )
1588 {
1589 const size_t kbegin( ( IsUpper_v<MT4> )
1590 ?( ( IsLower_v<MT5> )
1591 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1592 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1593 :( IsLower_v<MT5> ? j : 0UL ) );
1594 const size_t kend( ( IsLower_v<MT4> )
1595 ?( ( IsUpper_v<MT5> )
1596 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1597 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1598 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1599
1600 size_t k( kbegin );
1601
1602 if( k < kend )
1603 {
1604 SIMDType a1( set( A(i ,k) ) );
1605 SIMDType a2( set( A(i+1UL,k) ) );
1606 SIMDType a3( set( A(i+2UL,k) ) );
1607 SIMDType a4( set( A(i+3UL,k) ) );
1608 SIMDType b1( B.load(k,j ) );
1609 SIMDType b2( B.load(k,j+SIMDSIZE) );
1610 SIMDType xmm1( a1 * b1 );
1611 SIMDType xmm2( a1 * b2 );
1612 SIMDType xmm3( a2 * b1 );
1613 SIMDType xmm4( a2 * b2 );
1614 SIMDType xmm5( a3 * b1 );
1615 SIMDType xmm6( a3 * b2 );
1616 SIMDType xmm7( a4 * b1 );
1617 SIMDType xmm8( a4 * b2 );
1618
1619 for( ++k; k<kend; ++k ) {
1620 a1 = set( A(i ,k) );
1621 a2 = set( A(i+1UL,k) );
1622 a3 = set( A(i+2UL,k) );
1623 a4 = set( A(i+3UL,k) );
1624 b1 = B.load(k,j );
1625 b2 = B.load(k,j+SIMDSIZE);
1626 xmm1 += a1 * b1;
1627 xmm2 += a1 * b2;
1628 xmm3 += a2 * b1;
1629 xmm4 += a2 * b2;
1630 xmm5 += a3 * b1;
1631 xmm6 += a3 * b2;
1632 xmm7 += a4 * b1;
1633 xmm8 += a4 * b2;
1634 }
1635
1636 C.store( i , j , xmm1 );
1637 C.store( i , j+SIMDSIZE, xmm2 );
1638 C.store( i+1UL, j , xmm3 );
1639 C.store( i+1UL, j+SIMDSIZE, xmm4 );
1640 C.store( i+2UL, j , xmm5 );
1641 C.store( i+2UL, j+SIMDSIZE, xmm6 );
1642 C.store( i+3UL, j , xmm7 );
1643 C.store( i+3UL, j+SIMDSIZE, xmm8 );
1644 }
1645 else
1646 {
1647 const SIMDType zero;
1648 C.store( i , j , zero );
1649 C.store( i , j+SIMDSIZE, zero );
1650 C.store( i+1UL, j , zero );
1651 C.store( i+1UL, j+SIMDSIZE, zero );
1652 C.store( i+2UL, j , zero );
1653 C.store( i+2UL, j+SIMDSIZE, zero );
1654 C.store( i+3UL, j , zero );
1655 C.store( i+3UL, j+SIMDSIZE, zero );
1656 }
1657 }
1658
1659 for( ; (i+3UL) <= iend; i+=3UL )
1660 {
1661 const size_t kbegin( ( IsUpper_v<MT4> )
1662 ?( ( IsLower_v<MT5> )
1663 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1664 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1665 :( IsLower_v<MT5> ? j : 0UL ) );
1666 const size_t kend( ( IsLower_v<MT4> )
1667 ?( ( IsUpper_v<MT5> )
1668 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1669 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1670 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1671
1672 size_t k( kbegin );
1673
1674 if( k < kend )
1675 {
1676 SIMDType a1( set( A(i ,k) ) );
1677 SIMDType a2( set( A(i+1UL,k) ) );
1678 SIMDType a3( set( A(i+2UL,k) ) );
1679 SIMDType b1( B.load(k,j ) );
1680 SIMDType b2( B.load(k,j+SIMDSIZE) );
1681 SIMDType xmm1( a1 * b1 );
1682 SIMDType xmm2( a1 * b2 );
1683 SIMDType xmm3( a2 * b1 );
1684 SIMDType xmm4( a2 * b2 );
1685 SIMDType xmm5( a3 * b1 );
1686 SIMDType xmm6( a3 * b2 );
1687
1688 for( ++k; k<kend; ++k ) {
1689 a1 = set( A(i ,k) );
1690 a2 = set( A(i+1UL,k) );
1691 a3 = set( A(i+2UL,k) );
1692 b1 = B.load(k,j );
1693 b2 = B.load(k,j+SIMDSIZE);
1694 xmm1 += a1 * b1;
1695 xmm2 += a1 * b2;
1696 xmm3 += a2 * b1;
1697 xmm4 += a2 * b2;
1698 xmm5 += a3 * b1;
1699 xmm6 += a3 * b2;
1700 }
1701
1702 C.store( i , j , xmm1 );
1703 C.store( i , j+SIMDSIZE, xmm2 );
1704 C.store( i+1UL, j , xmm3 );
1705 C.store( i+1UL, j+SIMDSIZE, xmm4 );
1706 C.store( i+2UL, j , xmm5 );
1707 C.store( i+2UL, j+SIMDSIZE, xmm6 );
1708 }
1709 else
1710 {
1711 const SIMDType zero;
1712 C.store( i , j , zero );
1713 C.store( i , j+SIMDSIZE, zero );
1714 C.store( i+1UL, j , zero );
1715 C.store( i+1UL, j+SIMDSIZE, zero );
1716 C.store( i+2UL, j , zero );
1717 C.store( i+2UL, j+SIMDSIZE, zero );
1718 }
1719 }
1720
1721 for( ; (i+2UL) <= iend; i+=2UL )
1722 {
1723 const size_t kbegin( ( IsUpper_v<MT4> )
1724 ?( ( IsLower_v<MT5> )
1725 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1726 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1727 :( IsLower_v<MT5> ? j : 0UL ) );
1728 const size_t kend( ( IsLower_v<MT4> )
1729 ?( ( IsUpper_v<MT5> )
1730 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1731 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1732 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1733
1734 size_t k( kbegin );
1735
1736 if( k < kend )
1737 {
1738 SIMDType a1( set( A(i ,k) ) );
1739 SIMDType a2( set( A(i+1UL,k) ) );
1740 SIMDType b1( B.load(k,j ) );
1741 SIMDType b2( B.load(k,j+SIMDSIZE) );
1742 SIMDType xmm1( a1 * b1 );
1743 SIMDType xmm2( a1 * b2 );
1744 SIMDType xmm3( a2 * b1 );
1745 SIMDType xmm4( a2 * b2 );
1746
1747 for( ++k; k<kend; ++k ) {
1748 a1 = set( A(i ,k) );
1749 a2 = set( A(i+1UL,k) );
1750 b1 = B.load(k,j );
1751 b2 = B.load(k,j+SIMDSIZE);
1752 xmm1 += a1 * b1;
1753 xmm2 += a1 * b2;
1754 xmm3 += a2 * b1;
1755 xmm4 += a2 * b2;
1756 }
1757
1758 C.store( i , j , xmm1 );
1759 C.store( i , j+SIMDSIZE, xmm2 );
1760 C.store( i+1UL, j , xmm3 );
1761 C.store( i+1UL, j+SIMDSIZE, xmm4 );
1762 }
1763 else
1764 {
1765 const SIMDType zero;
1766 C.store( i , j , zero );
1767 C.store( i , j+SIMDSIZE, zero );
1768 C.store( i+1UL, j , zero );
1769 C.store( i+1UL, j+SIMDSIZE, zero );
1770 }
1771 }
1772
1773 if( i < iend )
1774 {
1775 const size_t kbegin( ( IsUpper_v<MT4> )
1776 ?( ( IsLower_v<MT5> )
1777 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1778 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1779 :( IsLower_v<MT5> ? j : 0UL ) );
1780 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1781
1782 size_t k( kbegin );
1783
1784 if( k < kend )
1785 {
1786 SIMDType a1( set( A(i,k) ) );
1787 SIMDType xmm1( a1 * B.load(k,j ) );
1788 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE) );
1789
1790 for( ++k; k<kend; ++k ) {
1791 a1 = set( A(i,k) );
1792 xmm1 += a1 * B.load(k,j );
1793 xmm2 += a1 * B.load(k,j+SIMDSIZE);
1794 }
1795
1796 C.store( i, j , xmm1 );
1797 C.store( i, j+SIMDSIZE, xmm2 );
1798 }
1799 else
1800 {
1801 const SIMDType zero;
1802 C.store( i, j , zero );
1803 C.store( i, j+SIMDSIZE, zero );
1804 }
1805
1806 if( UPP ) ++i;
1807 }
1808
1809 if( UPP ) {
1810 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1811 for( ; i<M; ++i ) {
1812 for( size_t jj=j; jj<jjend; ++jj ) {
1813 reset( C(i,jj) );
1814 }
1815 }
1816 }
1817 }
1818
1819 for( ; j<jpos; j+=SIMDSIZE )
1820 {
1821 const size_t iend( UPP ? min(j+SIMDSIZE,M) : M );
1822 size_t i( 0UL );
1823
1824 if( SYM || HERM ) {
1825 const size_t jjend( min(j+SIMDSIZE,N) );
1826 for( ; i<j; ++i ) {
1827 for( size_t jj=j; jj<jjend; ++jj ) {
1828 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1829 }
1830 }
1831 }
1832 else if( LOW ) {
1833 const size_t jjend( min(j+SIMDSIZE,N) );
1834 for( ; i<j; ++i ) {
1835 for( size_t jj=j; jj<jjend; ++jj ) {
1836 reset( C(i,jj) );
1837 }
1838 }
1839 }
1840
1841 for( ; (i+4UL) <= iend; i+=4UL )
1842 {
1843 const size_t kbegin( ( IsUpper_v<MT4> )
1844 ?( ( IsLower_v<MT5> )
1845 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1846 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1847 :( IsLower_v<MT5> ? j : 0UL ) );
1848 const size_t kend( ( IsLower_v<MT4> )
1849 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1850 :( K ) );
1851
1852 size_t k( kbegin );
1853
1854 if( k < kend )
1855 {
1856 SIMDType b1( B.load(k,j) );
1857 SIMDType xmm1( set( A(i ,k) ) * b1 );
1858 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
1859 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
1860 SIMDType xmm4( set( A(i+3UL,k) ) * b1 );
1861
1862 for( ++k; k<kend; ++k ) {
1863 b1 = B.load(k,j);
1864 xmm1 += set( A(i ,k) ) * b1;
1865 xmm2 += set( A(i+1UL,k) ) * b1;
1866 xmm3 += set( A(i+2UL,k) ) * b1;
1867 xmm4 += set( A(i+3UL,k) ) * b1;
1868 }
1869
1870 C.store( i , j, xmm1 );
1871 C.store( i+1UL, j, xmm2 );
1872 C.store( i+2UL, j, xmm3 );
1873 C.store( i+3UL, j, xmm4 );
1874 }
1875 else
1876 {
1877 const SIMDType zero;
1878 C.store( i , j, zero );
1879 C.store( i+1UL, j, zero );
1880 C.store( i+2UL, j, zero );
1881 C.store( i+3UL, j, zero );
1882 }
1883 }
1884
1885 for( ; (i+3UL) <= iend; i+=3UL )
1886 {
1887 const size_t kbegin( ( IsUpper_v<MT4> )
1888 ?( ( IsLower_v<MT5> )
1889 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1890 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1891 :( IsLower_v<MT5> ? j : 0UL ) );
1892 const size_t kend( ( IsLower_v<MT4> )
1893 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1894 :( K ) );
1895
1896 size_t k( kbegin );
1897
1898 if( k < kend )
1899 {
1900 SIMDType b1( B.load(k,j) );
1901 SIMDType xmm1( set( A(i ,k) ) * b1 );
1902 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
1903 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
1904
1905 for( ++k; k<kend; ++k ) {
1906 b1 = B.load(k,j);
1907 xmm1 += set( A(i ,k) ) * b1;
1908 xmm2 += set( A(i+1UL,k) ) * b1;
1909 xmm3 += set( A(i+2UL,k) ) * b1;
1910 }
1911
1912 C.store( i , j, xmm1 );
1913 C.store( i+1UL, j, xmm2 );
1914 C.store( i+2UL, j, xmm3 );
1915 }
1916 else
1917 {
1918 C.store( i , j, SIMDType() );
1919 C.store( i+1UL, j, SIMDType() );
1920 C.store( i+2UL, j, SIMDType() );
1921 }
1922 }
1923
1924 for( ; (i+2UL) <= iend; i+=2UL )
1925 {
1926 const size_t kbegin( ( IsUpper_v<MT4> )
1927 ?( ( IsLower_v<MT5> )
1928 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1929 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1930 :( IsLower_v<MT5> ? j : 0UL ) );
1931 const size_t kend( ( IsLower_v<MT4> )
1932 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1933 :( K ) );
1934
1935 size_t k( kbegin );
1936
1937 if( k < kend )
1938 {
1939 SIMDType b1( B.load(k,j) );
1940 SIMDType xmm1( set( A(i ,k) ) * b1 );
1941 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
1942
1943 for( ++k; k<kend; ++k ) {
1944 b1 = B.load(k,j);
1945 xmm1 += set( A(i ,k) ) * b1;
1946 xmm2 += set( A(i+1UL,k) ) * b1;
1947 }
1948
1949 C.store( i , j, xmm1 );
1950 C.store( i+1UL, j, xmm2 );
1951 }
1952 else
1953 {
1954 const SIMDType zero;
1955 C.store( i , j, zero );
1956 C.store( i+1UL, j, zero );
1957 }
1958 }
1959
1960 if( i < iend )
1961 {
1962 const size_t kbegin( ( IsUpper_v<MT4> )
1963 ?( ( IsLower_v<MT5> )
1964 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1965 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1966 :( IsLower_v<MT5> ? j : 0UL ) );
1967
1968 size_t k( kbegin );
1969
1970 if( k < K )
1971 {
1972 SIMDType xmm1( set( A(i,k) ) * B.load(k,j) );
1973
1974 for( ++k; k<K; ++k ) {
1975 xmm1 += set( A(i,k) ) * B.load(k,j);
1976 }
1977
1978 C.store( i, j, xmm1 );
1979 }
1980 else
1981 {
1982 const SIMDType zero;
1983 C.store( i, j, zero );
1984 }
1985
1986 if( UPP ) ++i;
1987 }
1988
1989 if( UPP ) {
1990 const size_t jjend( min(j+SIMDSIZE,N) );
1991 for( ; i<M; ++i ) {
1992 for( size_t jj=j; jj<jjend; ++jj ) {
1993 reset( C(i,jj) );
1994 }
1995 }
1996 }
1997 }
1998
1999 for( ; remainder && j<N; ++j )
2000 {
2001 size_t i( 0UL );
2002
2003 if( SYM || HERM ) {
2004 for( ; i<j; ++i ) {
2005 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
2006 }
2007 }
2008 else if( LOW ) {
2009 for( ; i<j; ++i ) {
2010 reset( C(i,j) );
2011 }
2012 }
2013
2014 for( ; (i+2UL) <= M; i+=2UL )
2015 {
2016 const size_t kbegin( ( IsUpper_v<MT4> )
2017 ?( ( IsLower_v<MT5> )
2018 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2019 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2020 :( IsLower_v<MT5> ? j : 0UL ) );
2021 const size_t kend( ( IsLower_v<MT4> )
2022 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2023 :( K ) );
2024
2025 size_t k( kbegin );
2026
2027 if( k < kend )
2028 {
2029 ElementType value1( A(i ,k) * B(k,j) );
2030 ElementType value2( A(i+1UL,k) * B(k,j) );
2031
2032 for( ++k; k<kend; ++k ) {
2033 value1 += A(i ,k) * B(k,j);
2034 value2 += A(i+1UL,k) * B(k,j);
2035 }
2036
2037 C(i ,j) = value1;
2038 C(i+1UL,j) = value2;
2039 }
2040 else
2041 {
2042 reset( C(i ,j) );
2043 reset( C(i+1UL,j) );
2044 }
2045 }
2046
2047 if( i < M )
2048 {
2049 const size_t kbegin( ( IsUpper_v<MT4> )
2050 ?( ( IsLower_v<MT5> )
2051 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2052 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2053 :( IsLower_v<MT5> ? j : 0UL ) );
2054
2055 size_t k( kbegin );
2056
2057 if( k < K )
2058 {
2059 ElementType value( A(i,k) * B(k,j) );
2060
2061 for( ++k; k<K; ++k ) {
2062 value += A(i,k) * B(k,j);
2063 }
2064
2065 C(i,j) = value;
2066 }
2067 else
2068 {
2069 reset( C(i,j) );
2070 }
2071 }
2072 }
2073 }
2075 //**********************************************************************************************
2076
2077 //**Vectorized default assignment to column-major dense matrices (small matrices)***************
2092 template< typename MT3 // Type of the left-hand side target matrix
2093 , typename MT4 // Type of the left-hand side matrix operand
2094 , typename MT5 > // Type of the right-hand side matrix operand
2095 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
2096 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2097 {
2098 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
2099
2100 const size_t M( A.rows() );
2101 const size_t N( B.columns() );
2102 const size_t K( A.columns() );
2103
2104 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2105
2106 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
2107 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
2108
2109 size_t i( 0UL );
2110
2111 if( IsIntegral_v<ElementType> )
2112 {
2113 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2114 for( size_t j=0UL; j<N; ++j )
2115 {
2116 const size_t kbegin( ( IsLower_v<MT5> )
2117 ?( ( IsUpper_v<MT4> )
2118 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2119 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2120 :( IsUpper_v<MT4> ? i : 0UL ) );
2121 const size_t kend( ( IsUpper_v<MT5> )
2122 ?( ( IsLower_v<MT4> )
2123 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2124 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
2125 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
2126
2127 size_t k( kbegin );
2128
2129 if( k < kend )
2130 {
2131 SIMDType b1( set( B(k,j) ) );
2132 SIMDType xmm1( A.load(i ,k) * b1 );
2133 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
2134 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
2135 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
2136 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
2137 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,k) * b1 );
2138 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,k) * b1 );
2139 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,k) * b1 );
2140
2141 for( ++k; k<kend; ++k ) {
2142 b1 = set( B(k,j) );
2143 xmm1 += A.load(i ,k) * b1;
2144 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2145 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2146 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2147 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2148 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
2149 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
2150 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
2151 }
2152
2153 C.store( i , j, xmm1 );
2154 C.store( i+SIMDSIZE , j, xmm2 );
2155 C.store( i+SIMDSIZE*2UL, j, xmm3 );
2156 C.store( i+SIMDSIZE*3UL, j, xmm4 );
2157 C.store( i+SIMDSIZE*4UL, j, xmm5 );
2158 C.store( i+SIMDSIZE*5UL, j, xmm6 );
2159 C.store( i+SIMDSIZE*6UL, j, xmm7 );
2160 C.store( i+SIMDSIZE*7UL, j, xmm8 );
2161 }
2162 else
2163 {
2164 const SIMDType zero;
2165 C.store( i , j, zero );
2166 C.store( i+SIMDSIZE , j, zero );
2167 C.store( i+SIMDSIZE*2UL, j, zero );
2168 C.store( i+SIMDSIZE*3UL, j, zero );
2169 C.store( i+SIMDSIZE*4UL, j, zero );
2170 C.store( i+SIMDSIZE*5UL, j, zero );
2171 C.store( i+SIMDSIZE*6UL, j, zero );
2172 C.store( i+SIMDSIZE*7UL, j, zero );
2173 }
2174 }
2175 }
2176 }
2177
2178 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2179 {
2180 size_t j( 0UL );
2181
2182 for( ; (j+2UL) <= N; j+=2UL )
2183 {
2184 const size_t kbegin( ( IsLower_v<MT5> )
2185 ?( ( IsUpper_v<MT4> )
2186 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2187 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2188 :( IsUpper_v<MT4> ? i : 0UL ) );
2189 const size_t kend( ( IsUpper_v<MT5> )
2190 ?( ( IsLower_v<MT4> )
2191 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2192 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2193 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
2194
2195 size_t k( kbegin );
2196
2197 if( k < kend )
2198 {
2199 SIMDType a1( A.load(i ,k) );
2200 SIMDType a2( A.load(i+SIMDSIZE ,k) );
2201 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2202 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2203 SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2204 SIMDType b1( set( B(k,j ) ) );
2205 SIMDType b2( set( B(k,j+1UL) ) );
2206 SIMDType xmm1 ( a1 * b1 );
2207 SIMDType xmm2 ( a2 * b1 );
2208 SIMDType xmm3 ( a3 * b1 );
2209 SIMDType xmm4 ( a4 * b1 );
2210 SIMDType xmm5 ( a5 * b1 );
2211 SIMDType xmm6 ( a1 * b2 );
2212 SIMDType xmm7 ( a2 * b2 );
2213 SIMDType xmm8 ( a3 * b2 );
2214 SIMDType xmm9 ( a4 * b2 );
2215 SIMDType xmm10( a5 * b2 );
2216
2217 for( ++k; k<kend; ++k ) {
2218 a1 = A.load(i ,k);
2219 a2 = A.load(i+SIMDSIZE ,k);
2220 a3 = A.load(i+SIMDSIZE*2UL,k);
2221 a4 = A.load(i+SIMDSIZE*3UL,k);
2222 a5 = A.load(i+SIMDSIZE*4UL,k);
2223 b1 = set( B(k,j ) );
2224 b2 = set( B(k,j+1UL) );
2225 xmm1 += a1 * b1;
2226 xmm2 += a2 * b1;
2227 xmm3 += a3 * b1;
2228 xmm4 += a4 * b1;
2229 xmm5 += a5 * b1;
2230 xmm6 += a1 * b2;
2231 xmm7 += a2 * b2;
2232 xmm8 += a3 * b2;
2233 xmm9 += a4 * b2;
2234 xmm10 += a5 * b2;
2235 }
2236
2237 C.store( i , j , xmm1 );
2238 C.store( i+SIMDSIZE , j , xmm2 );
2239 C.store( i+SIMDSIZE*2UL, j , xmm3 );
2240 C.store( i+SIMDSIZE*3UL, j , xmm4 );
2241 C.store( i+SIMDSIZE*4UL, j , xmm5 );
2242 C.store( i , j+1UL, xmm6 );
2243 C.store( i+SIMDSIZE , j+1UL, xmm7 );
2244 C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2245 C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2246 C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
2247 }
2248 else
2249 {
2250 const SIMDType zero;
2251 C.store( i , j , zero );
2252 C.store( i+SIMDSIZE , j , zero );
2253 C.store( i+SIMDSIZE*2UL, j , zero );
2254 C.store( i+SIMDSIZE*3UL, j , zero );
2255 C.store( i+SIMDSIZE*4UL, j , zero );
2256 C.store( i , j+1UL, zero );
2257 C.store( i+SIMDSIZE , j+1UL, zero );
2258 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
2259 C.store( i+SIMDSIZE*3UL, j+1UL, zero );
2260 C.store( i+SIMDSIZE*4UL, j+1UL, zero );
2261 }
2262 }
2263
2264 if( j < N )
2265 {
2266 const size_t kbegin( ( IsLower_v<MT5> )
2267 ?( ( IsUpper_v<MT4> )
2268 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2269 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2270 :( IsUpper_v<MT4> ? i : 0UL ) );
2271 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
2272
2273 size_t k( kbegin );
2274
2275 if( k < kend )
2276 {
2277 SIMDType b1( set( B(k,j) ) );
2278 SIMDType xmm1( A.load(i ,k) * b1 );
2279 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
2280 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
2281 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
2282 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
2283
2284 for( ++k; k<kend; ++k ) {
2285 b1 = set( B(k,j) );
2286 xmm1 += A.load(i ,k) * b1;
2287 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2288 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2289 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2290 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2291 }
2292
2293 C.store( i , j, xmm1 );
2294 C.store( i+SIMDSIZE , j, xmm2 );
2295 C.store( i+SIMDSIZE*2UL, j, xmm3 );
2296 C.store( i+SIMDSIZE*3UL, j, xmm4 );
2297 C.store( i+SIMDSIZE*4UL, j, xmm5 );
2298 }
2299 else
2300 {
2301 const SIMDType zero;
2302 C.store( i , j, zero );
2303 C.store( i+SIMDSIZE , j, zero );
2304 C.store( i+SIMDSIZE*2UL, j, zero );
2305 C.store( i+SIMDSIZE*3UL, j, zero );
2306 C.store( i+SIMDSIZE*4UL, j, zero );
2307 }
2308 }
2309 }
2310
2311 for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2312 {
2313 const size_t jend( LOW ? min(i+SIMDSIZE*4UL,N) : N );
2314 size_t j( 0UL );
2315
2316 if( SYM || HERM ) {
2317 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
2318 for( ; j<i; ++j ) {
2319 for( size_t ii=i; ii<iiend; ++ii ) {
2320 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
2321 }
2322 }
2323 }
2324 else if( UPP ) {
2325 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
2326 for( ; j<i; ++j ) {
2327 for( size_t ii=i; ii<iiend; ++ii ) {
2328 reset( C(ii,j) );
2329 }
2330 }
2331 }
2332
2333 for( ; (j+2UL) <= jend; j+=2UL )
2334 {
2335 const size_t kbegin( ( IsLower_v<MT5> )
2336 ?( ( IsUpper_v<MT4> )
2337 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2338 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2339 :( IsUpper_v<MT4> ? i : 0UL ) );
2340 const size_t kend( ( IsUpper_v<MT5> )
2341 ?( ( IsLower_v<MT4> )
2342 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2343 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2344 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
2345
2346 size_t k( kbegin );
2347
2348 if( k < kend )
2349 {
2350 SIMDType a1( A.load(i ,k) );
2351 SIMDType a2( A.load(i+SIMDSIZE ,k) );
2352 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2353 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2354 SIMDType b1( set( B(k,j ) ) );
2355 SIMDType b2( set( B(k,j+1UL) ) );
2356 SIMDType xmm1( a1 * b1 );
2357 SIMDType xmm2( a2 * b1 );
2358 SIMDType xmm3( a3 * b1 );
2359 SIMDType xmm4( a4 * b1 );
2360 SIMDType xmm5( a1 * b2 );
2361 SIMDType xmm6( a2 * b2 );
2362 SIMDType xmm7( a3 * b2 );
2363 SIMDType xmm8( a4 * b2 );
2364
2365 for( ++k; k<kend; ++k ) {
2366 a1 = A.load(i ,k);
2367 a2 = A.load(i+SIMDSIZE ,k);
2368 a3 = A.load(i+SIMDSIZE*2UL,k);
2369 a4 = A.load(i+SIMDSIZE*3UL,k);
2370 b1 = set( B(k,j ) );
2371 b2 = set( B(k,j+1UL) );
2372 xmm1 += a1 * b1;
2373 xmm2 += a2 * b1;
2374 xmm3 += a3 * b1;
2375 xmm4 += a4 * b1;
2376 xmm5 += a1 * b2;
2377 xmm6 += a2 * b2;
2378 xmm7 += a3 * b2;
2379 xmm8 += a4 * b2;
2380 }
2381
2382 C.store( i , j , xmm1 );
2383 C.store( i+SIMDSIZE , j , xmm2 );
2384 C.store( i+SIMDSIZE*2UL, j , xmm3 );
2385 C.store( i+SIMDSIZE*3UL, j , xmm4 );
2386 C.store( i , j+1UL, xmm5 );
2387 C.store( i+SIMDSIZE , j+1UL, xmm6 );
2388 C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2389 C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2390 }
2391 else
2392 {
2393 const SIMDType zero;
2394 C.store( i , j , zero );
2395 C.store( i+SIMDSIZE , j , zero );
2396 C.store( i+SIMDSIZE*2UL, j , zero );
2397 C.store( i+SIMDSIZE*3UL, j , zero );
2398 C.store( i , j+1UL, zero );
2399 C.store( i+SIMDSIZE , j+1UL, zero );
2400 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
2401 C.store( i+SIMDSIZE*3UL, j+1UL, zero );
2402 }
2403 }
2404
2405 if( j < jend )
2406 {
2407 const size_t kbegin( ( IsLower_v<MT5> )
2408 ?( ( IsUpper_v<MT4> )
2409 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2410 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2411 :( IsUpper_v<MT4> ? i : 0UL ) );
2412 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
2413
2414 size_t k( kbegin );
2415
2416 if( k < kend )
2417 {
2418 SIMDType b1( set( B(k,j) ) );
2419 SIMDType xmm1( A.load(i ,k) * b1 );
2420 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
2421 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
2422 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
2423
2424 for( ++k; k<kend; ++k ) {
2425 b1 = set( B(k,j) );
2426 xmm1 += A.load(i ,k) * b1;
2427 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2428 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2429 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2430 }
2431
2432 C.store( i , j, xmm1 );
2433 C.store( i+SIMDSIZE , j, xmm2 );
2434 C.store( i+SIMDSIZE*2UL, j, xmm3 );
2435 C.store( i+SIMDSIZE*3UL, j, xmm4 );
2436 }
2437 else
2438 {
2439 const SIMDType zero;
2440 C.store( i , j, zero );
2441 C.store( i+SIMDSIZE , j, zero );
2442 C.store( i+SIMDSIZE*2UL, j, zero );
2443 C.store( i+SIMDSIZE*3UL, j, zero );
2444 }
2445
2446 if( LOW ) ++j;
2447 }
2448
2449 if( LOW ) {
2450 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
2451 for( ; j<N; ++j ) {
2452 for( size_t ii=i; ii<iiend; ++ii ) {
2453 reset( C(ii,j) );
2454 }
2455 }
2456 }
2457 }
2458
2459 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2460 {
2461 const size_t jend( LOW ? min(i+SIMDSIZE*3UL,N) : N );
2462 size_t j( 0UL );
2463
2464 if( SYM || HERM ) {
2465 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
2466 for( ; j<i; ++j ) {
2467 for( size_t ii=i; ii<iiend; ++ii ) {
2468 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
2469 }
2470 }
2471 }
2472 else if( UPP ) {
2473 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
2474 for( ; j<i; ++j ) {
2475 for( size_t ii=i; ii<iiend; ++ii ) {
2476 reset( C(ii,j) );
2477 }
2478 }
2479 }
2480
2481 for( ; (j+2UL) <= jend; j+=2UL )
2482 {
2483 const size_t kbegin( ( IsLower_v<MT5> )
2484 ?( ( IsUpper_v<MT4> )
2485 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2486 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2487 :( IsUpper_v<MT4> ? i : 0UL ) );
2488 const size_t kend( ( IsUpper_v<MT5> )
2489 ?( ( IsLower_v<MT4> )
2490 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2491 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2492 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
2493
2494 size_t k( kbegin );
2495
2496 if( k < kend )
2497 {
2498 SIMDType a1( A.load(i ,k) );
2499 SIMDType a2( A.load(i+SIMDSIZE ,k) );
2500 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2501 SIMDType b1( set( B(k,j ) ) );
2502 SIMDType b2( set( B(k,j+1UL) ) );
2503 SIMDType xmm1( a1 * b1 );
2504 SIMDType xmm2( a2 * b1 );
2505 SIMDType xmm3( a3 * b1 );
2506 SIMDType xmm4( a1 * b2 );
2507 SIMDType xmm5( a2 * b2 );
2508 SIMDType xmm6( a3 * b2 );
2509
2510 for( ++k; k<kend; ++k ) {
2511 a1 = A.load(i ,k);
2512 a2 = A.load(i+SIMDSIZE ,k);
2513 a3 = A.load(i+SIMDSIZE*2UL,k);
2514 b1 = set( B(k,j ) );
2515 b2 = set( B(k,j+1UL) );
2516 xmm1 += a1 * b1;
2517 xmm2 += a2 * b1;
2518 xmm3 += a3 * b1;
2519 xmm4 += a1 * b2;
2520 xmm5 += a2 * b2;
2521 xmm6 += a3 * b2;
2522 }
2523
2524 C.store( i , j , xmm1 );
2525 C.store( i+SIMDSIZE , j , xmm2 );
2526 C.store( i+SIMDSIZE*2UL, j , xmm3 );
2527 C.store( i , j+1UL, xmm4 );
2528 C.store( i+SIMDSIZE , j+1UL, xmm5 );
2529 C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2530 }
2531 else
2532 {
2533 const SIMDType zero;
2534 C.store( i , j , zero );
2535 C.store( i+SIMDSIZE , j , zero );
2536 C.store( i+SIMDSIZE*2UL, j , zero );
2537 C.store( i , j+1UL, zero );
2538 C.store( i+SIMDSIZE , j+1UL, zero );
2539 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
2540 }
2541 }
2542
2543 if( j < jend )
2544 {
2545 const size_t kbegin( ( IsLower_v<MT5> )
2546 ?( ( IsUpper_v<MT4> )
2547 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2548 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2549 :( IsUpper_v<MT4> ? i : 0UL ) );
2550 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2551
2552 size_t k( kbegin );
2553
2554 if( k < kend )
2555 {
2556 SIMDType b1( set( B(k,j) ) );
2557 SIMDType xmm1( A.load(i ,k) * b1 );
2558 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
2559 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
2560
2561 for( ++k; k<kend; ++k ) {
2562 b1 = set( B(k,j) );
2563 xmm1 += A.load(i ,k) * b1;
2564 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2565 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2566 }
2567
2568 C.store( i , j, xmm1 );
2569 C.store( i+SIMDSIZE , j, xmm2 );
2570 C.store( i+SIMDSIZE*2UL, j, xmm3 );
2571 }
2572 else
2573 {
2574 const SIMDType zero;
2575 C.store( i , j, zero );
2576 C.store( i+SIMDSIZE , j, zero );
2577 C.store( i+SIMDSIZE*2UL, j, zero );
2578 }
2579
2580 if( LOW ) ++j;
2581 }
2582
2583 if( LOW ) {
2584 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
2585 for( ; j<N; ++j ) {
2586 for( size_t ii=i; ii<iiend; ++ii ) {
2587 reset( C(ii,j) );
2588 }
2589 }
2590 }
2591 }
2592
2593 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2594 {
2595 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
2596 size_t j( 0UL );
2597
2598 if( SYM || HERM ) {
2599 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
2600 for( ; j<i; ++j ) {
2601 for( size_t ii=i; ii<iiend; ++ii ) {
2602 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
2603 }
2604 }
2605 }
2606 else if( UPP ) {
2607 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
2608 for( ; j<i; ++j ) {
2609 for( size_t ii=i; ii<iiend; ++ii ) {
2610 reset( C(ii,j) );
2611 }
2612 }
2613 }
2614
2615 for( ; (j+4UL) <= jend; j+=4UL )
2616 {
2617 const size_t kbegin( ( IsLower_v<MT5> )
2618 ?( ( IsUpper_v<MT4> )
2619 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2620 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2621 :( IsUpper_v<MT4> ? i : 0UL ) );
2622 const size_t kend( ( IsUpper_v<MT5> )
2623 ?( ( IsLower_v<MT4> )
2624 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2625 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2626 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2627
2628 size_t k( kbegin );
2629
2630 if( k < kend )
2631 {
2632 SIMDType a1( A.load(i ,k) );
2633 SIMDType a2( A.load(i+SIMDSIZE,k) );
2634 SIMDType b1( set( B(k,j ) ) );
2635 SIMDType b2( set( B(k,j+1UL) ) );
2636 SIMDType b3( set( B(k,j+2UL) ) );
2637 SIMDType b4( set( B(k,j+3UL) ) );
2638 SIMDType xmm1( a1 * b1 );
2639 SIMDType xmm2( a2 * b1 );
2640 SIMDType xmm3( a1 * b2 );
2641 SIMDType xmm4( a2 * b2 );
2642 SIMDType xmm5( a1 * b3 );
2643 SIMDType xmm6( a2 * b3 );
2644 SIMDType xmm7( a1 * b4 );
2645 SIMDType xmm8( a2 * b4 );
2646
2647 for( ++k; k<kend; ++k ) {
2648 a1 = A.load(i ,k);
2649 a2 = A.load(i+SIMDSIZE,k);
2650 b1 = set( B(k,j ) );
2651 b2 = set( B(k,j+1UL) );
2652 b3 = set( B(k,j+2UL) );
2653 b4 = set( B(k,j+3UL) );
2654 xmm1 += a1 * b1;
2655 xmm2 += a2 * b1;
2656 xmm3 += a1 * b2;
2657 xmm4 += a2 * b2;
2658 xmm5 += a1 * b3;
2659 xmm6 += a2 * b3;
2660 xmm7 += a1 * b4;
2661 xmm8 += a2 * b4;
2662 }
2663
2664 C.store( i , j , xmm1 );
2665 C.store( i+SIMDSIZE, j , xmm2 );
2666 C.store( i , j+1UL, xmm3 );
2667 C.store( i+SIMDSIZE, j+1UL, xmm4 );
2668 C.store( i , j+2UL, xmm5 );
2669 C.store( i+SIMDSIZE, j+2UL, xmm6 );
2670 C.store( i , j+3UL, xmm7 );
2671 C.store( i+SIMDSIZE, j+3UL, xmm8 );
2672 }
2673 else
2674 {
2675 const SIMDType zero;
2676 C.store( i , j , zero );
2677 C.store( i+SIMDSIZE, j , zero );
2678 C.store( i , j+1UL, zero );
2679 C.store( i+SIMDSIZE, j+1UL, zero );
2680 C.store( i , j+2UL, zero );
2681 C.store( i+SIMDSIZE, j+2UL, zero );
2682 C.store( i , j+3UL, zero );
2683 C.store( i+SIMDSIZE, j+3UL, zero );
2684 }
2685 }
2686
2687 for( ; (j+3UL) <= jend; j+=3UL )
2688 {
2689 const size_t kbegin( ( IsLower_v<MT5> )
2690 ?( ( IsUpper_v<MT4> )
2691 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2692 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2693 :( IsUpper_v<MT4> ? i : 0UL ) );
2694 const size_t kend( ( IsUpper_v<MT5> )
2695 ?( ( IsLower_v<MT4> )
2696 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2697 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2698 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2699
2700 size_t k( kbegin );
2701
2702 if( k < kend )
2703 {
2704 SIMDType a1( A.load(i ,k) );
2705 SIMDType a2( A.load(i+SIMDSIZE,k) );
2706 SIMDType b1( set( B(k,j ) ) );
2707 SIMDType b2( set( B(k,j+1UL) ) );
2708 SIMDType b3( set( B(k,j+2UL) ) );
2709 SIMDType xmm1( a1 * b1 );
2710 SIMDType xmm2( a2 * b1 );
2711 SIMDType xmm3( a1 * b2 );
2712 SIMDType xmm4( a2 * b2 );
2713 SIMDType xmm5( a1 * b3 );
2714 SIMDType xmm6( a2 * b3 );
2715
2716 for( ++k; k<kend; ++k ) {
2717 a1 = A.load(i ,k);
2718 a2 = A.load(i+SIMDSIZE,k);
2719 b1 = set( B(k,j ) );
2720 b2 = set( B(k,j+1UL) );
2721 b3 = set( B(k,j+2UL) );
2722 xmm1 += a1 * b1;
2723 xmm2 += a2 * b1;
2724 xmm3 += a1 * b2;
2725 xmm4 += a2 * b2;
2726 xmm5 += a1 * b3;
2727 xmm6 += a2 * b3;
2728 }
2729
2730 C.store( i , j , xmm1 );
2731 C.store( i+SIMDSIZE, j , xmm2 );
2732 C.store( i , j+1UL, xmm3 );
2733 C.store( i+SIMDSIZE, j+1UL, xmm4 );
2734 C.store( i , j+2UL, xmm5 );
2735 C.store( i+SIMDSIZE, j+2UL, xmm6 );
2736 }
2737 else
2738 {
2739 const SIMDType zero;
2740 C.store( i , j , zero );
2741 C.store( i+SIMDSIZE, j , zero );
2742 C.store( i , j+1UL, zero );
2743 C.store( i+SIMDSIZE, j+1UL, zero );
2744 C.store( i , j+2UL, zero );
2745 C.store( i+SIMDSIZE, j+2UL, zero );
2746 }
2747 }
2748
2749 for( ; (j+2UL) <= jend; j+=2UL )
2750 {
2751 const size_t kbegin( ( IsLower_v<MT5> )
2752 ?( ( IsUpper_v<MT4> )
2753 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2754 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2755 :( IsUpper_v<MT4> ? i : 0UL ) );
2756 const size_t kend( ( IsUpper_v<MT5> )
2757 ?( ( IsLower_v<MT4> )
2758 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2759 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2760 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2761
2762 size_t k( kbegin );
2763
2764 if( k < kend )
2765 {
2766 SIMDType a1( A.load(i ,k) );
2767 SIMDType a2( A.load(i+SIMDSIZE,k) );
2768 SIMDType b1( set( B(k,j ) ) );
2769 SIMDType b2( set( B(k,j+1UL) ) );
2770 SIMDType xmm1( a1 * b1 );
2771 SIMDType xmm2( a2 * b1 );
2772 SIMDType xmm3( a1 * b2 );
2773 SIMDType xmm4( a2 * b2 );
2774
2775 for( ++k; k<kend; ++k ) {
2776 a1 = A.load(i ,k);
2777 a2 = A.load(i+SIMDSIZE,k);
2778 b1 = set( B(k,j ) );
2779 b2 = set( B(k,j+1UL) );
2780 xmm1 += a1 * b1;
2781 xmm2 += a2 * b1;
2782 xmm3 += a1 * b2;
2783 xmm4 += a2 * b2;
2784 }
2785
2786 C.store( i , j , xmm1 );
2787 C.store( i+SIMDSIZE, j , xmm2 );
2788 C.store( i , j+1UL, xmm3 );
2789 C.store( i+SIMDSIZE, j+1UL, xmm4 );
2790 }
2791 else
2792 {
2793 const SIMDType zero;
2794 C.store( i , j , zero );
2795 C.store( i+SIMDSIZE, j , zero );
2796 C.store( i , j+1UL, zero );
2797 C.store( i+SIMDSIZE, j+1UL, zero );
2798 }
2799 }
2800
2801 if( j < jend )
2802 {
2803 const size_t kbegin( ( IsLower_v<MT5> )
2804 ?( ( IsUpper_v<MT4> )
2805 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2806 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2807 :( IsUpper_v<MT4> ? i : 0UL ) );
2808 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2809
2810 size_t k( kbegin );
2811
2812 if( k < kend )
2813 {
2814 SIMDType b1( set( B(k,j) ) );
2815 SIMDType xmm1( A.load(i ,k) * b1 );
2816 SIMDType xmm2( A.load(i+SIMDSIZE,k) * b1 );
2817
2818 for( ++k; k<kend; ++k ) {
2819 b1 = set( B(k,j) );
2820 xmm1 += A.load(i ,k) * b1;
2821 xmm2 += A.load(i+SIMDSIZE,k) * b1;
2822 }
2823
2824 C.store( i , j, xmm1 );
2825 C.store( i+SIMDSIZE, j, xmm2 );
2826 }
2827 else
2828 {
2829 const SIMDType zero;
2830 C.store( i , j, zero );
2831 C.store( i+SIMDSIZE, j, zero );
2832 }
2833
2834 if( LOW ) ++j;
2835 }
2836
2837 if( LOW ) {
2838 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
2839 for( ; j<N; ++j ) {
2840 for( size_t ii=i; ii<iiend; ++ii ) {
2841 reset( C(ii,j) );
2842 }
2843 }
2844 }
2845 }
2846
2847 for( ; i<ipos; i+=SIMDSIZE )
2848 {
2849 const size_t jend( LOW ? min(i+SIMDSIZE,N) : N );
2850 size_t j( 0UL );
2851
2852 if( SYM || HERM ) {
2853 const size_t iiend( min(i+SIMDSIZE,M) );
2854 for( ; j<i; ++j ) {
2855 for( size_t ii=i; ii<iiend; ++ii ) {
2856 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
2857 }
2858 }
2859 }
2860 else if( UPP ) {
2861 const size_t iiend( min(i+SIMDSIZE,M) );
2862 for( ; j<i; ++j ) {
2863 for( size_t ii=i; ii<iiend; ++ii ) {
2864 reset( C(ii,j) );
2865 }
2866 }
2867 }
2868
2869 for( ; (j+4UL) <= jend; j+=4UL )
2870 {
2871 const size_t kbegin( ( IsLower_v<MT5> )
2872 ?( ( IsUpper_v<MT4> )
2873 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2874 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2875 :( IsUpper_v<MT4> ? i : 0UL ) );
2876 const size_t kend( ( IsUpper_v<MT5> )
2877 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2878 :( K ) );
2879
2880 size_t k( kbegin );
2881
2882 if( k < kend )
2883 {
2884 SIMDType a1( A.load(i,k) );
2885 SIMDType xmm1( a1 * set( B(k,j ) ) );
2886 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
2887 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
2888 SIMDType xmm4( a1 * set( B(k,j+3UL) ) );
2889
2890 for( ++k; k<kend; ++k ) {
2891 a1 = A.load(i,k);
2892 xmm1 += a1 * set( B(k,j ) );
2893 xmm2 += a1 * set( B(k,j+1UL) );
2894 xmm3 += a1 * set( B(k,j+2UL) );
2895 xmm4 += a1 * set( B(k,j+3UL) );
2896 }
2897
2898 C.store( i, j , xmm1 );
2899 C.store( i, j+1UL, xmm2 );
2900 C.store( i, j+2UL, xmm3 );
2901 C.store( i, j+3UL, xmm4 );
2902 }
2903 else
2904 {
2905 const SIMDType zero;
2906 C.store( i, j , zero );
2907 C.store( i, j+1UL, zero );
2908 C.store( i, j+2UL, zero );
2909 C.store( i, j+3UL, zero );
2910 }
2911 }
2912
2913 for( ; (j+3UL) <= jend; j+=3UL )
2914 {
2915 const size_t kbegin( ( IsLower_v<MT5> )
2916 ?( ( IsUpper_v<MT4> )
2917 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2918 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2919 :( IsUpper_v<MT4> ? i : 0UL ) );
2920 const size_t kend( ( IsUpper_v<MT5> )
2921 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2922 :( K ) );
2923
2924 size_t k( kbegin );
2925
2926 if( k < kend )
2927 {
2928 SIMDType a1( A.load(i,k) );
2929 SIMDType xmm1( a1 * set( B(k,j ) ) );
2930 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
2931 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
2932
2933 for( ++k; k<kend; ++k ) {
2934 a1 = A.load(i,k);
2935 xmm1 += a1 * set( B(k,j ) );
2936 xmm2 += a1 * set( B(k,j+1UL) );
2937 xmm3 += a1 * set( B(k,j+2UL) );
2938 }
2939
2940 C.store( i, j , xmm1 );
2941 C.store( i, j+1UL, xmm2 );
2942 C.store( i, j+2UL, xmm3 );
2943 }
2944 else
2945 {
2946 const SIMDType zero;
2947 C.store( i, j , zero );
2948 C.store( i, j+1UL, zero );
2949 C.store( i, j+2UL, zero );
2950 }
2951 }
2952
2953 for( ; (j+2UL) <= jend; j+=2UL )
2954 {
2955 const size_t kbegin( ( IsLower_v<MT5> )
2956 ?( ( IsUpper_v<MT4> )
2957 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2958 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2959 :( IsUpper_v<MT4> ? i : 0UL ) );
2960 const size_t kend( ( IsUpper_v<MT5> )
2961 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2962 :( K ) );
2963
2964 size_t k( kbegin );
2965
2966 if( k < kend )
2967 {
2968 SIMDType a1( A.load(i,k) );
2969 SIMDType xmm1( a1 * set( B(k,j ) ) );
2970 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
2971
2972 for( ++k; k<kend; ++k ) {
2973 a1 = A.load(i,k);
2974 xmm1 += a1 * set( B(k,j ) );
2975 xmm2 += a1 * set( B(k,j+1UL) );
2976 }
2977
2978 C.store( i, j , xmm1 );
2979 C.store( i, j+1UL, xmm2 );
2980 }
2981 else
2982 {
2983 const SIMDType zero;
2984 C.store( i, j , zero );
2985 C.store( i, j+1UL, zero );
2986 }
2987 }
2988
2989 if( j < jend )
2990 {
2991 const size_t kbegin( ( IsLower_v<MT5> )
2992 ?( ( IsUpper_v<MT4> )
2993 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2994 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2995 :( IsUpper_v<MT4> ? i : 0UL ) );
2996
2997 size_t k( kbegin );
2998
2999 if( k < K )
3000 {
3001 SIMDType xmm1( A.load(i,k) * set( B(k,j) ) );
3002
3003 for( ++k; k<K; ++k ) {
3004 xmm1 += A.load(i,k) * set( B(k,j) );
3005 }
3006
3007 C.store( i, j, xmm1 );
3008 }
3009 else
3010 {
3011 const SIMDType zero;
3012 C.store( i, j, zero );
3013 }
3014
3015 if( LOW ) ++j;
3016 }
3017
3018 if( LOW ) {
3019 const size_t iiend( min(i+SIMDSIZE,M) );
3020 for( ; j<N; ++j ) {
3021 for( size_t ii=i; ii<iiend; ++ii ) {
3022 reset( C(ii,j) );
3023 }
3024 }
3025 }
3026 }
3027
3028 for( ; remainder && i<M; ++i )
3029 {
3030 size_t j( 0UL );
3031
3032 if( SYM || HERM ) {
3033 for( ; j<i; ++j ) {
3034 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
3035 }
3036 }
3037 else if( UPP ) {
3038 for( ; j<i; ++j ) {
3039 reset( C(i,j) );
3040 }
3041 }
3042
3043 for( ; (j+2UL) <= N; j+=2UL )
3044 {
3045 const size_t kbegin( ( IsLower_v<MT5> )
3046 ?( ( IsUpper_v<MT4> )
3047 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3048 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3049 :( IsUpper_v<MT4> ? i : 0UL ) );
3050 const size_t kend( ( IsUpper_v<MT5> )
3051 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
3052 :( K ) );
3053
3054 size_t k( kbegin );
3055
3056 if( k < kend )
3057 {
3058 ElementType value1( A(i,k) * B(k,j ) );
3059 ElementType value2( A(i,k) * B(k,j+1UL) );
3060
3061 for( ++k; k<kend; ++k ) {
3062 value1 += A(i,k) * B(k,j );
3063 value2 += A(i,k) * B(k,j+1UL);
3064 }
3065
3066 C(i,j ) = value1;
3067 C(i,j+1UL) = value2;
3068 }
3069 else
3070 {
3071 reset( C(i,j ) );
3072 reset( C(i,j+1UL) );
3073 }
3074 }
3075
3076 if( j < N )
3077 {
3078 const size_t kbegin( ( IsLower_v<MT5> )
3079 ?( ( IsUpper_v<MT4> )
3080 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3081 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3082 :( IsUpper_v<MT4> ? i : 0UL ) );
3083
3084 size_t k( kbegin );
3085
3086 if( k < K )
3087 {
3088 ElementType value( A(i,k) * B(k,j) );
3089
3090 for( ++k; k<K; ++k ) {
3091 value += A(i,k) * B(k,j);
3092 }
3093
3094 C(i,j) = value;
3095 }
3096 else
3097 {
3098 reset( C(i,j) );
3099 }
3100 }
3101 }
3102 }
3104 //**********************************************************************************************
3105
3106 //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel, non-vectorized overload.
// This overload takes part in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t). It performs no
// computation of its own: C, A and B are handed on unchanged to selectDefaultAssignKernel().
3120 template< typename MT3 // Type of the left-hand side target matrix
3121 , typename MT4 // Type of the left-hand side matrix operand
3122 , typename MT5 > // Type of the right-hand side matrix operand
3123 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
3124 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3125 {
// Fall back to the default kernel
3126 selectDefaultAssignKernel( C, A, B );
3127 }
3129 //**********************************************************************************************
3130
3131 //**Vectorized default assignment to dense matrices (large matrices)****************************
3146 template< typename MT3 // Type of the left-hand side target matrix
3147 , typename MT4 // Type of the left-hand side matrix operand
3148 , typename MT5 > // Type of the right-hand side matrix operand
3149 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
3150 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3151 {
3152 if( SYM )
3153 smmm( C, A, B, ElementType(1) );
3154 else if( HERM )
3155 hmmm( C, A, B, ElementType(1) );
3156 else if( LOW )
3157 lmmm( C, A, B, ElementType(1), ElementType(0) );
3158 else if( UPP )
3159 ummm( C, A, B, ElementType(1), ElementType(0) );
3160 else
3161 mmm( C, A, B, ElementType(1), ElementType(0) );
3162 }
3164 //**********************************************************************************************
3165
3166 //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS assignment kernel, fallback overload.
// This overload takes part in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5>
// is false (DisableIf_t). It forwards C, A and B unchanged to selectLargeAssignKernel().
3180 template< typename MT3 // Type of the left-hand side target matrix
3181 , typename MT4 // Type of the left-hand side matrix operand
3182 , typename MT5 > // Type of the right-hand side matrix operand
3183 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
3184 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3185 {
// No BLAS kernel applicable: use the large-matrix kernel instead
3186 selectLargeAssignKernel( C, A, B );
3187 }
3189 //**********************************************************************************************
3190
3191 //**BLAS-based assignment to dense matrices*****************************************************
3192#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS assignment kernel, BLAS overload.
// Compiled only when both macros of the surrounding #if evaluate to non-zero, and taking
// part in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5> is true.
// Dispatch on the operand types:
//  - A triangular:            C is assigned B, then trmm() is called with A and CblasLeft
//  - otherwise B triangular:  C is assigned A, then trmm() is called with B and CblasRight
//  - otherwise:               gemm() is called with the scalars ET(1) and ET(0)
3206 template< typename MT3 // Type of the left-hand side target matrix
3207 , typename MT4 // Type of the left-hand side matrix operand
3208 , typename MT5 > // Type of the right-hand side matrix operand
3209 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
3210 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3211 {
// Element type of the target matrix; used to build the scalar arguments below
3212 using ET = ElementType_t<MT3>;
3213
// Left operand is triangular; CblasLower/CblasUpper is picked from IsLower_v<MT4>
// NOTE(review): trmm() presumably multiplies C in place by the triangular operand - confirm
3214 if( IsTriangular_v<MT4> ) {
3215 assign( C, B );
3216 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
3217 }
// Right operand is triangular; CblasLower/CblasUpper is picked from IsLower_v<MT5>
3218 else if( IsTriangular_v<MT5> ) {
3219 assign( C, A );
3220 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
3221 }
// Neither operand is triangular
3222 else {
3223 gemm( C, A, B, ET(1), ET(0) );
3224 }
3225 }
3227#endif
3228 //**********************************************************************************************
3229
3230 //**Assignment to sparse matrices***************************************************************
// Assignment of the multiplication expression 'rhs' to the sparse matrix 'lhs'.
// The expression is evaluated via serial() into a temporary 'tmp', whose type is
// ResultType when SO is true and OppositeType otherwise. The temporary is then passed
// through a ForwardFunctor instance and assigned to the target.
// NOTE(review): this listing skips source lines inside the body (3247 and 3251-3256);
// whatever those lines contain is not visible here.
3243 template< typename MT // Type of the target sparse matrix
3244 , bool SO > // Storage order of the target sparse matrix
3245 friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
3246 {
3248
// Type of the intermediate result, chosen by the storage order of the target
3249 using TmpType = If_t< SO, ResultType, OppositeType >;
3250
3257
// The target must already have the dimensions of the expression
3258 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
3259 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
3260
3261 const ForwardFunctor fwd;
3262
// Evaluate the product into the temporary, then assign the forwarded temporary
3263 const TmpType tmp( serial( rhs ) );
3264 assign( *lhs, fwd( tmp ) );
3265 }
3267 //**********************************************************************************************
3268
3269 //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the multiplication expression 'rhs' to the dense matrix 'lhs'.
// Returns without touching the target if the target has no rows or no columns, or if
// the left operand of the expression has no columns. Otherwise both operands are
// evaluated via serial() into A (type LT) and B (type RT) and the work is handed to
// selectAddAssignKernel().
// NOTE(review): this listing skips source line 3286 inside the body; its content is not
// visible here.
3282 template< typename MT // Type of the target dense matrix
3283 , bool SO > // Storage order of the target dense matrix
3284 friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
3285 {
3287
// The target must already have the dimensions of the expression
3288 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
3289 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
3290
// Nothing to add for an empty target or an empty inner dimension
3291 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3292 return;
3293 }
3294
3295 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3296 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3297
// The evaluated operands must match the sizes of the original operands and of the target
3298 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3299 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3300 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3301 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3302 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
3303 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
3304
// Choose and run the addition assignment kernel
3305 TDMatDMatMultExpr::selectAddAssignKernel( *lhs, A, B );
3306 }
3308 //**********************************************************************************************
3309
3310 //**Addition assignment to dense matrices (kernel selection)************************************
3321 template< typename MT3 // Type of the left-hand side target matrix
3322 , typename MT4 // Type of the left-hand side matrix operand
3323 , typename MT5 > // Type of the right-hand side matrix operand
3324 static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3325 {
3326 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
3327 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
3328 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
3329 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3330 selectSmallAddAssignKernel( C, A, B );
3331 else
3332 selectBlasAddAssignKernel( C, A, B );
3333 }
3335 //**********************************************************************************************
3336
3337 //**Default addition assignment to row-major dense matrices (general/general)*******************
3351 template< typename MT3 // Type of the left-hand side target matrix
3352 , typename MT4 // Type of the left-hand side matrix operand
3353 , typename MT5 > // Type of the right-hand side matrix operand
3354 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3355 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3356 {
3357 const size_t M( A.rows() );
3358 const size_t N( B.columns() );
3359 const size_t K( A.columns() );
3360
3361 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3362
3363 for( size_t i=0UL; i<M; ++i )
3364 {
3365 const size_t kbegin( ( IsUpper_v<MT4> )
3366 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3367 :( 0UL ) );
3368 const size_t kend( ( IsLower_v<MT4> )
3369 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3370 :( K ) );
3371 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3372
3373 for( size_t k=kbegin; k<kend; ++k )
3374 {
3375 const size_t jbegin( ( IsUpper_v<MT5> )
3376 ?( ( IsStrictlyUpper_v<MT5> )
3377 ?( UPP ? max(i,k+1UL) : k+1UL )
3378 :( UPP ? max(i,k) : k ) )
3379 :( UPP ? i : 0UL ) );
3380 const size_t jend( ( IsLower_v<MT5> )
3381 ?( ( IsStrictlyLower_v<MT5> )
3382 ?( LOW ? min(i+1UL,k) : k )
3383 :( LOW ? min(i,k)+1UL : k+1UL ) )
3384 :( LOW ? i+1UL : N ) );
3385
3386 if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
3387 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3388
3389 const size_t jnum( jend - jbegin );
3390 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
3391 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
3392
3393 for( size_t j=jbegin; j<jpos; j+=2UL ) {
3394 C(i,j ) += A(i,k) * B(k,j );
3395 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
3396 }
3397 if( jpos < jend ) {
3398 C(i,jpos) += A(i,k) * B(k,jpos);
3399 }
3400 }
3401 }
3402 }
3404 //**********************************************************************************************
3405
3406 //**Default addition assignment to column-major dense matrices (general/general)****************
3420 template< typename MT3 // Type of the left-hand side target matrix
3421 , typename MT4 // Type of the left-hand side matrix operand
3422 , typename MT5 > // Type of the right-hand side matrix operand
3423 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3424 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3425 {
3426 const size_t M( A.rows() );
3427 const size_t N( B.columns() );
3428 const size_t K( A.columns() );
3429
3430 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3431
3432 for( size_t j=0UL; j<N; ++j )
3433 {
3434 const size_t kbegin( ( IsLower_v<MT5> )
3435 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3436 :( 0UL ) );
3437 const size_t kend( ( IsUpper_v<MT5> )
3438 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3439 :( K ) );
3440 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3441
3442 for( size_t k=kbegin; k<kend; ++k )
3443 {
3444 const size_t ibegin( ( IsLower_v<MT4> )
3445 ?( ( IsStrictlyLower_v<MT4> )
3446 ?( LOW ? max(j,k+1UL) : k+1UL )
3447 :( LOW ? max(j,k) : k ) )
3448 :( LOW ? j : 0UL ) );
3449 const size_t iend( ( IsUpper_v<MT4> )
3450 ?( ( IsStrictlyUpper_v<MT4> )
3451 ?( UPP ? min(j+1UL,k) : k )
3452 :( UPP ? min(j,k)+1UL : k+1UL ) )
3453 :( UPP ? j+1UL : M ) );
3454
3455 if( ( LOW || UPP ) && ibegin >= iend ) continue;
3456 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3457
3458 const size_t inum( iend - ibegin );
3459 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
3460 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
3461
3462 for( size_t i=ibegin; i<ipos; i+=2UL ) {
3463 C(i ,j) += A(i ,k) * B(k,j);
3464 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
3465 }
3466 if( ipos < iend ) {
3467 C(ipos,j) += A(ipos,k) * B(k,j);
3468 }
3469 }
3470 }
3471 }
3473 //**********************************************************************************************
3474
3475 //**Default addition assignment to row-major dense matrices (general/diagonal)******************
3489 template< typename MT3 // Type of the left-hand side target matrix
3490 , typename MT4 // Type of the left-hand side matrix operand
3491 , typename MT5 > // Type of the right-hand side matrix operand
3492 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3493 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3494 {
3495 constexpr size_t block( BLOCK_SIZE );
3496
3497 const size_t M( A.rows() );
3498 const size_t N( B.columns() );
3499
3500 for( size_t ii=0UL; ii<M; ii+=block ) {
3501 const size_t iend( min( M, ii+block ) );
3502 for( size_t jj=0UL; jj<N; jj+=block ) {
3503 const size_t jend( min( N, jj+block ) );
3504 for( size_t i=ii; i<iend; ++i )
3505 {
3506 const size_t jbegin( ( IsUpper_v<MT4> )
3507 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
3508 :( jj ) );
3509 const size_t jpos( ( IsLower_v<MT4> )
3510 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
3511 :( jend ) );
3512
3513 for( size_t j=jbegin; j<jpos; ++j ) {
3514 C(i,j) += A(i,j) * B(j,j);
3515 }
3516 }
3517 }
3518 }
3519 }
3521 //**********************************************************************************************
3522
3523 //**Default addition assignment to column-major dense matrices (general/diagonal)***************
3537 template< typename MT3 // Type of the left-hand side target matrix
3538 , typename MT4 // Type of the left-hand side matrix operand
3539 , typename MT5 > // Type of the right-hand side matrix operand
3540 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3541 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3542 {
3543 const size_t M( A.rows() );
3544 const size_t N( B.columns() );
3545
3546 for( size_t j=0UL; j<N; ++j )
3547 {
3548 const size_t ibegin( ( IsLower_v<MT4> )
3549 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3550 :( 0UL ) );
3551 const size_t iend( ( IsUpper_v<MT4> )
3552 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
3553 :( M ) );
3554 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3555
3556 const size_t inum( iend - ibegin );
3557 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
3558 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
3559
3560 for( size_t i=ibegin; i<ipos; i+=2UL ) {
3561 C(i ,j) += A(i ,j) * B(j,j);
3562 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
3563 }
3564 if( ipos < iend ) {
3565 C(ipos,j) += A(ipos,j) * B(j,j);
3566 }
3567 }
3568 }
3570 //**********************************************************************************************
3571
3572 //**Default addition assignment to row-major dense matrices (diagonal/general)******************
3586 template< typename MT3 // Type of the left-hand side target matrix
3587 , typename MT4 // Type of the left-hand side matrix operand
3588 , typename MT5 > // Type of the right-hand side matrix operand
3589 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3590 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3591 {
3592 const size_t M( A.rows() );
3593 const size_t N( B.columns() );
3594
3595 for( size_t i=0UL; i<M; ++i )
3596 {
3597 const size_t jbegin( ( IsUpper_v<MT5> )
3598 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3599 :( 0UL ) );
3600 const size_t jend( ( IsLower_v<MT5> )
3601 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
3602 :( N ) );
3603 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3604
3605 const size_t jnum( jend - jbegin );
3606 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
3607 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
3608
3609 for( size_t j=jbegin; j<jpos; j+=2UL ) {
3610 C(i,j ) += A(i,i) * B(i,j );
3611 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
3612 }
3613 if( jpos < jend ) {
3614 C(i,jpos) += A(i,i) * B(i,jpos);
3615 }
3616 }
3617 }
3619 //**********************************************************************************************
3620
3621 //**Default addition assignment to column-major dense matrices (diagonal/general)***************
3635 template< typename MT3 // Type of the left-hand side target matrix
3636 , typename MT4 // Type of the left-hand side matrix operand
3637 , typename MT5 > // Type of the right-hand side matrix operand
3638 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3639 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3640 {
3641 constexpr size_t block( BLOCK_SIZE );
3642
3643 const size_t M( A.rows() );
3644 const size_t N( B.columns() );
3645
3646 for( size_t jj=0UL; jj<N; jj+=block ) {
3647 const size_t jend( min( N, jj+block ) );
3648 for( size_t ii=0UL; ii<M; ii+=block ) {
3649 const size_t iend( min( M, ii+block ) );
3650 for( size_t j=jj; j<jend; ++j )
3651 {
3652 const size_t ibegin( ( IsLower_v<MT5> )
3653 ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
3654 :( ii ) );
3655 const size_t ipos( ( IsUpper_v<MT5> )
3656 ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
3657 :( iend ) );
3658
3659 for( size_t i=ibegin; i<ipos; ++i ) {
3660 C(i,j) += A(i,i) * B(i,j);
3661 }
3662 }
3663 }
3664 }
3665 }
3667 //**********************************************************************************************
3668
3669 //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
3683 template< typename MT3 // Type of the left-hand side target matrix
3684 , typename MT4 // Type of the left-hand side matrix operand
3685 , typename MT5 > // Type of the right-hand side matrix operand
3686 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3687 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3688 {
3689 for( size_t i=0UL; i<A.rows(); ++i ) {
3690 C(i,i) += A(i,i) * B(i,i);
3691 }
3692 }
3694 //**********************************************************************************************
3695
3696 //**Default addition assignment to dense matrices (small matrices)******************************
3710 template< typename MT3 // Type of the left-hand side target matrix
3711 , typename MT4 // Type of the left-hand side matrix operand
3712 , typename MT5 > // Type of the right-hand side matrix operand
3713 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3714 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3715 {
3716 selectDefaultAddAssignKernel( C, A, B );
3717 }
3719 //**********************************************************************************************
3720
3721 //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
3736 template< typename MT3 // Type of the left-hand side target matrix
3737 , typename MT4 // Type of the left-hand side matrix operand
3738 , typename MT5 > // Type of the right-hand side matrix operand
3739 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3740 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3741 {
3742 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3743
3744 const size_t M( A.rows() );
3745 const size_t N( B.columns() );
3746 const size_t K( A.columns() );
3747
3748 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3749
3750 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
3751 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
3752
3753 size_t j( 0UL );
3754
3755 if( IsIntegral_v<ElementType> )
3756 {
3757 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3758 for( size_t i=0UL; i<M; ++i )
3759 {
3760 const size_t kbegin( ( IsUpper_v<MT4> )
3761 ?( ( IsLower_v<MT5> )
3762 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3763 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3764 :( IsLower_v<MT5> ? j : 0UL ) );
3765 const size_t kend( ( IsLower_v<MT4> )
3766 ?( ( IsUpper_v<MT5> )
3767 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3768 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3769 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
3770
3771 SIMDType xmm1( C.load(i,j ) );
3772 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3773 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3774 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3775 SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3776 SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
3777 SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
3778 SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
3779
3780 for( size_t k=kbegin; k<kend; ++k ) {
3781 const SIMDType a1( set( A(i,k) ) );
3782 xmm1 += a1 * B.load(k,j );
3783 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3784 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3785 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3786 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3787 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
3788 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
3789 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
3790 }
3791
3792 C.store( i, j , xmm1 );
3793 C.store( i, j+SIMDSIZE , xmm2 );
3794 C.store( i, j+SIMDSIZE*2UL, xmm3 );
3795 C.store( i, j+SIMDSIZE*3UL, xmm4 );
3796 C.store( i, j+SIMDSIZE*4UL, xmm5 );
3797 C.store( i, j+SIMDSIZE*5UL, xmm6 );
3798 C.store( i, j+SIMDSIZE*6UL, xmm7 );
3799 C.store( i, j+SIMDSIZE*7UL, xmm8 );
3800 }
3801 }
3802 }
3803
3804 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3805 {
3806 size_t i( 0UL );
3807
3808 for( ; (i+2UL) <= M; i+=2UL )
3809 {
3810 const size_t kbegin( ( IsUpper_v<MT4> )
3811 ?( ( IsLower_v<MT5> )
3812 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3813 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3814 :( IsLower_v<MT5> ? j : 0UL ) );
3815 const size_t kend( ( IsLower_v<MT4> )
3816 ?( ( IsUpper_v<MT5> )
3817 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3818 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3819 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
3820
3821 SIMDType xmm1 ( C.load(i ,j ) );
3822 SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
3823 SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
3824 SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
3825 SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
3826 SIMDType xmm6 ( C.load(i+1UL,j ) );
3827 SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
3828 SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
3829 SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
3830 SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
3831
3832 for( size_t k=kbegin; k<kend; ++k ) {
3833 const SIMDType a1( set( A(i ,k) ) );
3834 const SIMDType a2( set( A(i+1UL,k) ) );
3835 const SIMDType b1( B.load(k,j ) );
3836 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3837 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3838 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3839 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3840 xmm1 += a1 * b1;
3841 xmm2 += a1 * b2;
3842 xmm3 += a1 * b3;
3843 xmm4 += a1 * b4;
3844 xmm5 += a1 * b5;
3845 xmm6 += a2 * b1;
3846 xmm7 += a2 * b2;
3847 xmm8 += a2 * b3;
3848 xmm9 += a2 * b4;
3849 xmm10 += a2 * b5;
3850 }
3851
3852 C.store( i , j , xmm1 );
3853 C.store( i , j+SIMDSIZE , xmm2 );
3854 C.store( i , j+SIMDSIZE*2UL, xmm3 );
3855 C.store( i , j+SIMDSIZE*3UL, xmm4 );
3856 C.store( i , j+SIMDSIZE*4UL, xmm5 );
3857 C.store( i+1UL, j , xmm6 );
3858 C.store( i+1UL, j+SIMDSIZE , xmm7 );
3859 C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3860 C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3861 C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3862 }
3863
3864 if( i < M )
3865 {
3866 const size_t kbegin( ( IsUpper_v<MT4> )
3867 ?( ( IsLower_v<MT5> )
3868 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3869 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3870 :( IsLower_v<MT5> ? j : 0UL ) );
3871 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3872
3873 SIMDType xmm1( C.load(i,j ) );
3874 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3875 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3876 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3877 SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3878
3879 for( size_t k=kbegin; k<kend; ++k ) {
3880 const SIMDType a1( set( A(i,k) ) );
3881 xmm1 += a1 * B.load(k,j );
3882 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3883 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3884 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3885 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3886 }
3887
3888 C.store( i, j , xmm1 );
3889 C.store( i, j+SIMDSIZE , xmm2 );
3890 C.store( i, j+SIMDSIZE*2UL, xmm3 );
3891 C.store( i, j+SIMDSIZE*3UL, xmm4 );
3892 C.store( i, j+SIMDSIZE*4UL, xmm5 );
3893 }
3894 }
3895
3896 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3897 {
3898 size_t i( 0UL );
3899
3900 for( ; (i+2UL) <= M; i+=2UL )
3901 {
3902 const size_t kbegin( ( IsUpper_v<MT4> )
3903 ?( ( IsLower_v<MT5> )
3904 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3905 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3906 :( IsLower_v<MT5> ? j : 0UL ) );
3907 const size_t kend( ( IsLower_v<MT4> )
3908 ?( ( IsUpper_v<MT5> )
3909 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3910 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3911 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
3912
3913 SIMDType xmm1( C.load(i ,j ) );
3914 SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3915 SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3916 SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
3917 SIMDType xmm5( C.load(i+1UL,j ) );
3918 SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
3919 SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
3920 SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
3921
3922 for( size_t k=kbegin; k<kend; ++k ) {
3923 const SIMDType a1( set( A(i ,k) ) );
3924 const SIMDType a2( set( A(i+1UL,k) ) );
3925 const SIMDType b1( B.load(k,j ) );
3926 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3927 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3928 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3929 xmm1 += a1 * b1;
3930 xmm2 += a1 * b2;
3931 xmm3 += a1 * b3;
3932 xmm4 += a1 * b4;
3933 xmm5 += a2 * b1;
3934 xmm6 += a2 * b2;
3935 xmm7 += a2 * b3;
3936 xmm8 += a2 * b4;
3937 }
3938
3939 C.store( i , j , xmm1 );
3940 C.store( i , j+SIMDSIZE , xmm2 );
3941 C.store( i , j+SIMDSIZE*2UL, xmm3 );
3942 C.store( i , j+SIMDSIZE*3UL, xmm4 );
3943 C.store( i+1UL, j , xmm5 );
3944 C.store( i+1UL, j+SIMDSIZE , xmm6 );
3945 C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3946 C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3947 }
3948
3949 if( i < M )
3950 {
3951 const size_t kbegin( ( IsUpper_v<MT4> )
3952 ?( ( IsLower_v<MT5> )
3953 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3954 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3955 :( IsLower_v<MT5> ? j : 0UL ) );
3956 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3957
3958 SIMDType xmm1( C.load(i,j ) );
3959 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3960 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3961 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3962
3963 for( size_t k=kbegin; k<kend; ++k ) {
3964 const SIMDType a1( set( A(i,k) ) );
3965 xmm1 += a1 * B.load(k,j );
3966 xmm2 += a1 * B.load(k,j+SIMDSIZE );
3967 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3968 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3969 }
3970
3971 C.store( i, j , xmm1 );
3972 C.store( i, j+SIMDSIZE , xmm2 );
3973 C.store( i, j+SIMDSIZE*2UL, xmm3 );
3974 C.store( i, j+SIMDSIZE*3UL, xmm4 );
3975 }
3976 }
3977
3978 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3979 {
3980 size_t i( 0UL );
3981
3982 for( ; (i+2UL) <= M; i+=2UL )
3983 {
3984 const size_t kbegin( ( IsUpper_v<MT4> )
3985 ?( ( IsLower_v<MT5> )
3986 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3987 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3988 :( IsLower_v<MT5> ? j : 0UL ) );
3989 const size_t kend( ( IsLower_v<MT4> )
3990 ?( ( IsUpper_v<MT5> )
3991 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3992 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3993 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
3994
3995 SIMDType xmm1( C.load(i ,j ) );
3996 SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3997 SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3998 SIMDType xmm4( C.load(i+1UL,j ) );
3999 SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
4000 SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
4001
4002 for( size_t k=kbegin; k<kend; ++k ) {
4003 const SIMDType a1( set( A(i ,k) ) );
4004 const SIMDType a2( set( A(i+1UL,k) ) );
4005 const SIMDType b1( B.load(k,j ) );
4006 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4007 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4008 xmm1 += a1 * b1;
4009 xmm2 += a1 * b2;
4010 xmm3 += a1 * b3;
4011 xmm4 += a2 * b1;
4012 xmm5 += a2 * b2;
4013 xmm6 += a2 * b3;
4014 }
4015
4016 C.store( i , j , xmm1 );
4017 C.store( i , j+SIMDSIZE , xmm2 );
4018 C.store( i , j+SIMDSIZE*2UL, xmm3 );
4019 C.store( i+1UL, j , xmm4 );
4020 C.store( i+1UL, j+SIMDSIZE , xmm5 );
4021 C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
4022 }
4023
4024 if( i < M )
4025 {
4026 const size_t kbegin( ( IsUpper_v<MT4> )
4027 ?( ( IsLower_v<MT5> )
4028 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4029 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4030 :( IsLower_v<MT5> ? j : 0UL ) );
4031 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
4032
4033 SIMDType xmm1( C.load(i,j ) );
4034 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
4035 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
4036
4037 for( size_t k=kbegin; k<kend; ++k ) {
4038 const SIMDType a1( set( A(i,k) ) );
4039 xmm1 += a1 * B.load(k,j );
4040 xmm2 += a1 * B.load(k,j+SIMDSIZE );
4041 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
4042 }
4043
4044 C.store( i, j , xmm1 );
4045 C.store( i, j+SIMDSIZE , xmm2 );
4046 C.store( i, j+SIMDSIZE*2UL, xmm3 );
4047 }
4048 }
4049
4050 for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4051 {
4052 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
4053 size_t i( LOW ? j : 0UL );
4054
4055 for( ; (i+4UL) <= iend; i+=4UL )
4056 {
4057 const size_t kbegin( ( IsUpper_v<MT4> )
4058 ?( ( IsLower_v<MT5> )
4059 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4060 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4061 :( IsLower_v<MT5> ? j : 0UL ) );
4062 const size_t kend( ( IsLower_v<MT4> )
4063 ?( ( IsUpper_v<MT5> )
4064 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
4065 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
4066 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
4067
4068 SIMDType xmm1( C.load(i ,j ) );
4069 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
4070 SIMDType xmm3( C.load(i+1UL,j ) );
4071 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
4072 SIMDType xmm5( C.load(i+2UL,j ) );
4073 SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
4074 SIMDType xmm7( C.load(i+3UL,j ) );
4075 SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
4076
4077 for( size_t k=kbegin; k<kend; ++k ) {
4078 const SIMDType a1( set( A(i ,k) ) );
4079 const SIMDType a2( set( A(i+1UL,k) ) );
4080 const SIMDType a3( set( A(i+2UL,k) ) );
4081 const SIMDType a4( set( A(i+3UL,k) ) );
4082 const SIMDType b1( B.load(k,j ) );
4083 const SIMDType b2( B.load(k,j+SIMDSIZE) );
4084 xmm1 += a1 * b1;
4085 xmm2 += a1 * b2;
4086 xmm3 += a2 * b1;
4087 xmm4 += a2 * b2;
4088 xmm5 += a3 * b1;
4089 xmm6 += a3 * b2;
4090 xmm7 += a4 * b1;
4091 xmm8 += a4 * b2;
4092 }
4093
4094 C.store( i , j , xmm1 );
4095 C.store( i , j+SIMDSIZE, xmm2 );
4096 C.store( i+1UL, j , xmm3 );
4097 C.store( i+1UL, j+SIMDSIZE, xmm4 );
4098 C.store( i+2UL, j , xmm5 );
4099 C.store( i+2UL, j+SIMDSIZE, xmm6 );
4100 C.store( i+3UL, j , xmm7 );
4101 C.store( i+3UL, j+SIMDSIZE, xmm8 );
4102 }
4103
4104 for( ; (i+3UL) <= iend; i+=3UL )
4105 {
4106 const size_t kbegin( ( IsUpper_v<MT4> )
4107 ?( ( IsLower_v<MT5> )
4108 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4109 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4110 :( IsLower_v<MT5> ? j : 0UL ) );
4111 const size_t kend( ( IsLower_v<MT4> )
4112 ?( ( IsUpper_v<MT5> )
4113 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
4114 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
4115 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
4116
4117 SIMDType xmm1( C.load(i ,j ) );
4118 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
4119 SIMDType xmm3( C.load(i+1UL,j ) );
4120 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
4121 SIMDType xmm5( C.load(i+2UL,j ) );
4122 SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
4123
4124 for( size_t k=kbegin; k<kend; ++k ) {
4125 const SIMDType a1( set( A(i ,k) ) );
4126 const SIMDType a2( set( A(i+1UL,k) ) );
4127 const SIMDType a3( set( A(i+2UL,k) ) );
4128 const SIMDType b1( B.load(k,j ) );
4129 const SIMDType b2( B.load(k,j+SIMDSIZE) );
4130 xmm1 += a1 * b1;
4131 xmm2 += a1 * b2;
4132 xmm3 += a2 * b1;
4133 xmm4 += a2 * b2;
4134 xmm5 += a3 * b1;
4135 xmm6 += a3 * b2;
4136 }
4137
4138 C.store( i , j , xmm1 );
4139 C.store( i , j+SIMDSIZE, xmm2 );
4140 C.store( i+1UL, j , xmm3 );
4141 C.store( i+1UL, j+SIMDSIZE, xmm4 );
4142 C.store( i+2UL, j , xmm5 );
4143 C.store( i+2UL, j+SIMDSIZE, xmm6 );
4144 }
4145
4146 for( ; (i+2UL) <= iend; i+=2UL )
4147 {
4148 const size_t kbegin( ( IsUpper_v<MT4> )
4149 ?( ( IsLower_v<MT5> )
4150 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4151 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4152 :( IsLower_v<MT5> ? j : 0UL ) );
4153 const size_t kend( ( IsLower_v<MT4> )
4154 ?( ( IsUpper_v<MT5> )
4155 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
4156 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
4157 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
4158
4159 SIMDType xmm1( C.load(i ,j ) );
4160 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
4161 SIMDType xmm3( C.load(i+1UL,j ) );
4162 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
4163
4164 for( size_t k=kbegin; k<kend; ++k ) {
4165 const SIMDType a1( set( A(i ,k) ) );
4166 const SIMDType a2( set( A(i+1UL,k) ) );
4167 const SIMDType b1( B.load(k,j ) );
4168 const SIMDType b2( B.load(k,j+SIMDSIZE) );
4169 xmm1 += a1 * b1;
4170 xmm2 += a1 * b2;
4171 xmm3 += a2 * b1;
4172 xmm4 += a2 * b2;
4173 }
4174
4175 C.store( i , j , xmm1 );
4176 C.store( i , j+SIMDSIZE, xmm2 );
4177 C.store( i+1UL, j , xmm3 );
4178 C.store( i+1UL, j+SIMDSIZE, xmm4 );
4179 }
4180
4181 if( i < iend )
4182 {
4183 const size_t kbegin( ( IsUpper_v<MT4> )
4184 ?( ( IsLower_v<MT5> )
4185 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4186 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4187 :( IsLower_v<MT5> ? j : 0UL ) );
4188 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
4189
4190 SIMDType xmm1( C.load(i,j ) );
4191 SIMDType xmm2( C.load(i,j+SIMDSIZE) );
4192
4193 for( size_t k=kbegin; k<kend; ++k ) {
4194 const SIMDType a1( set( A(i,k) ) );
4195 xmm1 += a1 * B.load(k,j );
4196 xmm2 += a1 * B.load(k,j+SIMDSIZE);
4197 }
4198
4199 C.store( i, j , xmm1 );
4200 C.store( i, j+SIMDSIZE, xmm2 );
4201 }
4202 }
4203
4204 for( ; j<jpos; j+=SIMDSIZE )
4205 {
4206 const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
4207 size_t i( LOW ? j : 0UL );
4208
4209 for( ; (i+4UL) <= iend; i+=4UL )
4210 {
4211 const size_t kbegin( ( IsUpper_v<MT4> )
4212 ?( ( IsLower_v<MT5> )
4213 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4214 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4215 :( IsLower_v<MT5> ? j : 0UL ) );
4216 const size_t kend( ( IsLower_v<MT4> )
4217 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
4218 :( K ) );
4219
4220 SIMDType xmm1( C.load(i ,j) );
4221 SIMDType xmm2( C.load(i+1UL,j) );
4222 SIMDType xmm3( C.load(i+2UL,j) );
4223 SIMDType xmm4( C.load(i+3UL,j) );
4224
4225 for( size_t k=kbegin; k<kend; ++k ) {
4226 const SIMDType b1( B.load(k,j) );
4227 xmm1 += set( A(i ,k) ) * b1;
4228 xmm2 += set( A(i+1UL,k) ) * b1;
4229 xmm3 += set( A(i+2UL,k) ) * b1;
4230 xmm4 += set( A(i+3UL,k) ) * b1;
4231 }
4232
4233 C.store( i , j, xmm1 );
4234 C.store( i+1UL, j, xmm2 );
4235 C.store( i+2UL, j, xmm3 );
4236 C.store( i+3UL, j, xmm4 );
4237 }
4238
4239 for( ; (i+3UL) <= iend; i+=3UL )
4240 {
4241 const size_t kbegin( ( IsUpper_v<MT4> )
4242 ?( ( IsLower_v<MT5> )
4243 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4244 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4245 :( IsLower_v<MT5> ? j : 0UL ) );
4246 const size_t kend( ( IsLower_v<MT4> )
4247 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
4248 :( K ) );
4249
4250 SIMDType xmm1( C.load(i ,j) );
4251 SIMDType xmm2( C.load(i+1UL,j) );
4252 SIMDType xmm3( C.load(i+2UL,j) );
4253
4254 for( size_t k=kbegin; k<kend; ++k ) {
4255 const SIMDType b1( B.load(k,j) );
4256 xmm1 += set( A(i ,k) ) * b1;
4257 xmm2 += set( A(i+1UL,k) ) * b1;
4258 xmm3 += set( A(i+2UL,k) ) * b1;
4259 }
4260
4261 C.store( i , j, xmm1 );
4262 C.store( i+1UL, j, xmm2 );
4263 C.store( i+2UL, j, xmm3 );
4264 }
4265
4266 for( ; (i+2UL) <= iend; i+=2UL )
4267 {
4268 const size_t kbegin( ( IsUpper_v<MT4> )
4269 ?( ( IsLower_v<MT5> )
4270 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4271 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4272 :( IsLower_v<MT5> ? j : 0UL ) );
4273 const size_t kend( ( IsLower_v<MT4> )
4274 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4275 :( K ) );
4276
4277 SIMDType xmm1( C.load(i ,j) );
4278 SIMDType xmm2( C.load(i+1UL,j) );
4279
4280 for( size_t k=kbegin; k<kend; ++k ) {
4281 const SIMDType b1( B.load(k,j) );
4282 xmm1 += set( A(i ,k) ) * b1;
4283 xmm2 += set( A(i+1UL,k) ) * b1;
4284 }
4285
4286 C.store( i , j, xmm1 );
4287 C.store( i+1UL, j, xmm2 );
4288 }
4289
4290 if( i < iend )
4291 {
4292 const size_t kbegin( ( IsUpper_v<MT4> )
4293 ?( ( IsLower_v<MT5> )
4294 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4295 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4296 :( IsLower_v<MT5> ? j : 0UL ) );
4297
4298 SIMDType xmm1( C.load(i,j) );
4299
4300 for( size_t k=kbegin; k<K; ++k ) {
4301 xmm1 += set( A(i,k) ) * B.load(k,j);
4302 }
4303
4304 C.store( i, j, xmm1 );
4305 }
4306 }
4307
4308 for( ; remainder && j<N; ++j )
4309 {
4310 const size_t iend( UPP ? j+1UL : M );
4311 size_t i( LOW ? j : 0UL );
4312
4313 for( ; (i+2UL) <= iend; i+=2UL )
4314 {
4315 const size_t kbegin( ( IsUpper_v<MT4> )
4316 ?( ( IsLower_v<MT5> )
4317 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4318 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4319 :( IsLower_v<MT5> ? j : 0UL ) );
4320 const size_t kend( ( IsLower_v<MT4> )
4321 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4322 :( K ) );
4323
4324 ElementType value1( C(i ,j) );
4325 ElementType value2( C(i+1UL,j) );;
4326
4327 for( size_t k=kbegin; k<kend; ++k ) {
4328 value1 += A(i ,k) * B(k,j);
4329 value2 += A(i+1UL,k) * B(k,j);
4330 }
4331
4332 C(i ,j) = value1;
4333 C(i+1UL,j) = value2;
4334 }
4335
4336 if( i < iend )
4337 {
4338 const size_t kbegin( ( IsUpper_v<MT4> )
4339 ?( ( IsLower_v<MT5> )
4340 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4341 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4342 :( IsLower_v<MT5> ? j : 0UL ) );
4343
4344 ElementType value( C(i,j) );
4345
4346 for( size_t k=kbegin; k<K; ++k ) {
4347 value += A(i,k) * B(k,j);
4348 }
4349
4350 C(i,j) = value;
4351 }
4352 }
4353 }
4355 //**********************************************************************************************
4356
4357 //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
4372 template< typename MT3 // Type of the left-hand side target matrix
4373 , typename MT4 // Type of the left-hand side matrix operand
4374 , typename MT5 > // Type of the right-hand side matrix operand
4375 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4376 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4377 {
4378 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
4379
4380 const size_t M( A.rows() );
4381 const size_t N( B.columns() );
4382 const size_t K( A.columns() );
4383
4384 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4385
4386 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
4387 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
4388
4389 size_t i( 0UL );
4390
4391 if( IsIntegral_v<ElementType> )
4392 {
4393 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
4394 for( size_t j=0UL; j<N; ++j )
4395 {
4396 const size_t kbegin( ( IsLower_v<MT5> )
4397 ?( ( IsUpper_v<MT4> )
4398 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4399 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4400 :( IsUpper_v<MT4> ? i : 0UL ) );
4401 const size_t kend( ( IsUpper_v<MT5> )
4402 ?( ( IsLower_v<MT4> )
4403 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4404 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
4405 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
4406
4407 SIMDType xmm1( C.load(i ,j) );
4408 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
4409 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
4410 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
4411 SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
4412 SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
4413 SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
4414 SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
4415
4416 for( size_t k=kbegin; k<kend; ++k ) {
4417 const SIMDType b1( set( B(k,j) ) );
4418 xmm1 += A.load(i ,k) * b1;
4419 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4420 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4421 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
4422 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
4423 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
4424 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
4425 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
4426 }
4427
4428 C.store( i , j, xmm1 );
4429 C.store( i+SIMDSIZE , j, xmm2 );
4430 C.store( i+SIMDSIZE*2UL, j, xmm3 );
4431 C.store( i+SIMDSIZE*3UL, j, xmm4 );
4432 C.store( i+SIMDSIZE*4UL, j, xmm5 );
4433 C.store( i+SIMDSIZE*5UL, j, xmm6 );
4434 C.store( i+SIMDSIZE*6UL, j, xmm7 );
4435 C.store( i+SIMDSIZE*7UL, j, xmm8 );
4436 }
4437 }
4438 }
4439
4440 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
4441 {
4442 size_t j( 0UL );
4443
4444 for( ; (j+2UL) <= N; j+=2UL )
4445 {
4446 const size_t kbegin( ( IsLower_v<MT5> )
4447 ?( ( IsUpper_v<MT4> )
4448 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4449 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4450 :( IsUpper_v<MT4> ? i : 0UL ) );
4451 const size_t kend( ( IsUpper_v<MT5> )
4452 ?( ( IsLower_v<MT4> )
4453 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4454 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4455 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
4456
4457 SIMDType xmm1 ( C.load(i ,j ) );
4458 SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
4459 SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
4460 SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
4461 SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
4462 SIMDType xmm6 ( C.load(i ,j+1UL) );
4463 SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
4464 SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
4465 SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
4466 SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
4467
4468 for( size_t k=kbegin; k<kend; ++k ) {
4469 const SIMDType a1( A.load(i ,k) );
4470 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4471 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4472 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4473 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
4474 const SIMDType b1( set( B(k,j ) ) );
4475 const SIMDType b2( set( B(k,j+1UL) ) );
4476 xmm1 += a1 * b1;
4477 xmm2 += a2 * b1;
4478 xmm3 += a3 * b1;
4479 xmm4 += a4 * b1;
4480 xmm5 += a5 * b1;
4481 xmm6 += a1 * b2;
4482 xmm7 += a2 * b2;
4483 xmm8 += a3 * b2;
4484 xmm9 += a4 * b2;
4485 xmm10 += a5 * b2;
4486 }
4487
4488 C.store( i , j , xmm1 );
4489 C.store( i+SIMDSIZE , j , xmm2 );
4490 C.store( i+SIMDSIZE*2UL, j , xmm3 );
4491 C.store( i+SIMDSIZE*3UL, j , xmm4 );
4492 C.store( i+SIMDSIZE*4UL, j , xmm5 );
4493 C.store( i , j+1UL, xmm6 );
4494 C.store( i+SIMDSIZE , j+1UL, xmm7 );
4495 C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
4496 C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
4497 C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
4498 }
4499
4500 if( j < N )
4501 {
4502 const size_t kbegin( ( IsLower_v<MT5> )
4503 ?( ( IsUpper_v<MT4> )
4504 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4505 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4506 :( IsUpper_v<MT4> ? i : 0UL ) );
4507 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
4508
4509 SIMDType xmm1( C.load(i ,j) );
4510 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
4511 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
4512 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
4513 SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
4514
4515 for( size_t k=kbegin; k<kend; ++k ) {
4516 const SIMDType b1( set( B(k,j) ) );
4517 xmm1 += A.load(i ,k) * b1;
4518 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4519 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4520 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
4521 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
4522 }
4523
4524 C.store( i , j, xmm1 );
4525 C.store( i+SIMDSIZE , j, xmm2 );
4526 C.store( i+SIMDSIZE*2UL, j, xmm3 );
4527 C.store( i+SIMDSIZE*3UL, j, xmm4 );
4528 C.store( i+SIMDSIZE*4UL, j, xmm5 );
4529 }
4530 }
4531
4532 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4533 {
4534 size_t j( 0UL );
4535
4536 for( ; (j+2UL) <= N; j+=2UL )
4537 {
4538 const size_t kbegin( ( IsLower_v<MT5> )
4539 ?( ( IsUpper_v<MT4> )
4540 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4541 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4542 :( IsUpper_v<MT4> ? i : 0UL ) );
4543 const size_t kend( ( IsUpper_v<MT5> )
4544 ?( ( IsLower_v<MT4> )
4545 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4546 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4547 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
4548
4549 SIMDType xmm1( C.load(i ,j ) );
4550 SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
4551 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
4552 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
4553 SIMDType xmm5( C.load(i ,j+1UL) );
4554 SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
4555 SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
4556 SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
4557
4558 for( size_t k=kbegin; k<kend; ++k ) {
4559 const SIMDType a1( A.load(i ,k) );
4560 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4561 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4562 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4563 const SIMDType b1( set( B(k,j ) ) );
4564 const SIMDType b2( set( B(k,j+1UL) ) );
4565 xmm1 += a1 * b1;
4566 xmm2 += a2 * b1;
4567 xmm3 += a3 * b1;
4568 xmm4 += a4 * b1;
4569 xmm5 += a1 * b2;
4570 xmm6 += a2 * b2;
4571 xmm7 += a3 * b2;
4572 xmm8 += a4 * b2;
4573 }
4574
4575 C.store( i , j , xmm1 );
4576 C.store( i+SIMDSIZE , j , xmm2 );
4577 C.store( i+SIMDSIZE*2UL, j , xmm3 );
4578 C.store( i+SIMDSIZE*3UL, j , xmm4 );
4579 C.store( i , j+1UL, xmm5 );
4580 C.store( i+SIMDSIZE , j+1UL, xmm6 );
4581 C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
4582 C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
4583 }
4584
4585 if( j < N )
4586 {
4587 const size_t kbegin( ( IsLower_v<MT5> )
4588 ?( ( IsUpper_v<MT4> )
4589 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4590 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4591 :( IsUpper_v<MT4> ? i : 0UL ) );
4592 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
4593
4594 SIMDType xmm1( C.load(i ,j) );
4595 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
4596 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
4597 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
4598
4599 for( size_t k=kbegin; k<kend; ++k ) {
4600 const SIMDType b1( set( B(k,j) ) );
4601 xmm1 += A.load(i ,k) * b1;
4602 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4603 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4604 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
4605 }
4606
4607 C.store( i , j, xmm1 );
4608 C.store( i+SIMDSIZE , j, xmm2 );
4609 C.store( i+SIMDSIZE*2UL, j, xmm3 );
4610 C.store( i+SIMDSIZE*3UL, j, xmm4 );
4611 }
4612 }
4613
4614 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4615 {
4616 size_t j( 0UL );
4617
4618 for( ; (j+2UL) <= N; j+=2UL )
4619 {
4620 const size_t kbegin( ( IsLower_v<MT5> )
4621 ?( ( IsUpper_v<MT4> )
4622 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4623 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4624 :( IsUpper_v<MT4> ? i : 0UL ) );
4625 const size_t kend( ( IsUpper_v<MT5> )
4626 ?( ( IsLower_v<MT4> )
4627 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4628 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4629 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
4630
4631 SIMDType xmm1( C.load(i ,j ) );
4632 SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
4633 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
4634 SIMDType xmm4( C.load(i ,j+1UL) );
4635 SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
4636 SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
4637
4638 for( size_t k=kbegin; k<kend; ++k ) {
4639 const SIMDType a1( A.load(i ,k) );
4640 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4641 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4642 const SIMDType b1( set( B(k,j ) ) );
4643 const SIMDType b2( set( B(k,j+1UL) ) );
4644 xmm1 += a1 * b1;
4645 xmm2 += a2 * b1;
4646 xmm3 += a3 * b1;
4647 xmm4 += a1 * b2;
4648 xmm5 += a2 * b2;
4649 xmm6 += a3 * b2;
4650 }
4651
4652 C.store( i , j , xmm1 );
4653 C.store( i+SIMDSIZE , j , xmm2 );
4654 C.store( i+SIMDSIZE*2UL, j , xmm3 );
4655 C.store( i , j+1UL, xmm4 );
4656 C.store( i+SIMDSIZE , j+1UL, xmm5 );
4657 C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
4658 }
4659
4660 if( j < N )
4661 {
4662 const size_t kbegin( ( IsLower_v<MT5> )
4663 ?( ( IsUpper_v<MT4> )
4664 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4665 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4666 :( IsUpper_v<MT4> ? i : 0UL ) );
4667 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
4668
4669 SIMDType xmm1( C.load(i ,j) );
4670 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
4671 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
4672
4673 for( size_t k=kbegin; k<kend; ++k ) {
4674 const SIMDType b1( set( B(k,j) ) );
4675 xmm1 += A.load(i ,k) * b1;
4676 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4677 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4678 }
4679
4680 C.store( i , j, xmm1 );
4681 C.store( i+SIMDSIZE , j, xmm2 );
4682 C.store( i+SIMDSIZE*2UL, j, xmm3 );
4683 }
4684 }
4685
4686 for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4687 {
4688 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
4689 size_t j( UPP ? i : 0UL );
4690
4691 for( ; (j+4UL) <= jend; j+=4UL )
4692 {
4693 const size_t kbegin( ( IsLower_v<MT5> )
4694 ?( ( IsUpper_v<MT4> )
4695 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4696 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4697 :( IsUpper_v<MT4> ? i : 0UL ) );
4698 const size_t kend( ( IsUpper_v<MT5> )
4699 ?( ( IsLower_v<MT4> )
4700 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
4701 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
4702 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4703
4704 SIMDType xmm1( C.load(i ,j ) );
4705 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4706 SIMDType xmm3( C.load(i ,j+1UL) );
4707 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4708 SIMDType xmm5( C.load(i ,j+2UL) );
4709 SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
4710 SIMDType xmm7( C.load(i ,j+3UL) );
4711 SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
4712
4713 for( size_t k=kbegin; k<kend; ++k ) {
4714 const SIMDType a1( A.load(i ,k) );
4715 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4716 const SIMDType b1( set( B(k,j ) ) );
4717 const SIMDType b2( set( B(k,j+1UL) ) );
4718 const SIMDType b3( set( B(k,j+2UL) ) );
4719 const SIMDType b4( set( B(k,j+3UL) ) );
4720 xmm1 += a1 * b1;
4721 xmm2 += a2 * b1;
4722 xmm3 += a1 * b2;
4723 xmm4 += a2 * b2;
4724 xmm5 += a1 * b3;
4725 xmm6 += a2 * b3;
4726 xmm7 += a1 * b4;
4727 xmm8 += a2 * b4;
4728 }
4729
4730 C.store( i , j , xmm1 );
4731 C.store( i+SIMDSIZE, j , xmm2 );
4732 C.store( i , j+1UL, xmm3 );
4733 C.store( i+SIMDSIZE, j+1UL, xmm4 );
4734 C.store( i , j+2UL, xmm5 );
4735 C.store( i+SIMDSIZE, j+2UL, xmm6 );
4736 C.store( i , j+3UL, xmm7 );
4737 C.store( i+SIMDSIZE, j+3UL, xmm8 );
4738 }
4739
4740 for( ; (j+3UL) <= jend; j+=3UL )
4741 {
4742 const size_t kbegin( ( IsLower_v<MT5> )
4743 ?( ( IsUpper_v<MT4> )
4744 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4745 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4746 :( IsUpper_v<MT4> ? i : 0UL ) );
4747 const size_t kend( ( IsUpper_v<MT5> )
4748 ?( ( IsLower_v<MT4> )
4749 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
4750 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
4751 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4752
4753 SIMDType xmm1( C.load(i ,j ) );
4754 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4755 SIMDType xmm3( C.load(i ,j+1UL) );
4756 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4757 SIMDType xmm5( C.load(i ,j+2UL) );
4758 SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
4759
4760 for( size_t k=kbegin; k<kend; ++k ) {
4761 const SIMDType a1( A.load(i ,k) );
4762 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4763 const SIMDType b1( set( B(k,j ) ) );
4764 const SIMDType b2( set( B(k,j+1UL) ) );
4765 const SIMDType b3( set( B(k,j+2UL) ) );
4766 xmm1 += a1 * b1;
4767 xmm2 += a2 * b1;
4768 xmm3 += a1 * b2;
4769 xmm4 += a2 * b2;
4770 xmm5 += a1 * b3;
4771 xmm6 += a2 * b3;
4772 }
4773
4774 C.store( i , j , xmm1 );
4775 C.store( i+SIMDSIZE, j , xmm2 );
4776 C.store( i , j+1UL, xmm3 );
4777 C.store( i+SIMDSIZE, j+1UL, xmm4 );
4778 C.store( i , j+2UL, xmm5 );
4779 C.store( i+SIMDSIZE, j+2UL, xmm6 );
4780 }
4781
4782 for( ; (j+2UL) <= jend; j+=2UL )
4783 {
4784 const size_t kbegin( ( IsLower_v<MT5> )
4785 ?( ( IsUpper_v<MT4> )
4786 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4787 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4788 :( IsUpper_v<MT4> ? i : 0UL ) );
4789 const size_t kend( ( IsUpper_v<MT5> )
4790 ?( ( IsLower_v<MT4> )
4791 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4792 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4793 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4794
4795 SIMDType xmm1( C.load(i ,j ) );
4796 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4797 SIMDType xmm3( C.load(i ,j+1UL) );
4798 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4799
4800 for( size_t k=kbegin; k<kend; ++k ) {
4801 const SIMDType a1( A.load(i ,k) );
4802 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4803 const SIMDType b1( set( B(k,j ) ) );
4804 const SIMDType b2( set( B(k,j+1UL) ) );
4805 xmm1 += a1 * b1;
4806 xmm2 += a2 * b1;
4807 xmm3 += a1 * b2;
4808 xmm4 += a2 * b2;
4809 }
4810
4811 C.store( i , j , xmm1 );
4812 C.store( i+SIMDSIZE, j , xmm2 );
4813 C.store( i , j+1UL, xmm3 );
4814 C.store( i+SIMDSIZE, j+1UL, xmm4 );
4815 }
4816
4817 if( j < jend )
4818 {
4819 const size_t kbegin( ( IsLower_v<MT5> )
4820 ?( ( IsUpper_v<MT4> )
4821 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4822 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4823 :( IsUpper_v<MT4> ? i : 0UL ) );
4824 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
4825
4826 SIMDType xmm1( C.load(i ,j) );
4827 SIMDType xmm2( C.load(i+SIMDSIZE,j) );
4828
4829 for( size_t k=kbegin; k<kend; ++k ) {
4830 const SIMDType b1( set( B(k,j) ) );
4831 xmm1 += A.load(i ,k) * b1;
4832 xmm2 += A.load(i+SIMDSIZE,k) * b1;
4833 }
4834
4835 C.store( i , j, xmm1 );
4836 C.store( i+SIMDSIZE, j, xmm2 );
4837 }
4838 }
4839
4840 for( ; i<ipos; i+=SIMDSIZE )
4841 {
4842 const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
4843 size_t j( UPP ? i : 0UL );
4844
4845 for( ; (j+4UL) <= jend; j+=4UL )
4846 {
4847 const size_t kbegin( ( IsLower_v<MT5> )
4848 ?( ( IsUpper_v<MT4> )
4849 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4850 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4851 :( IsUpper_v<MT4> ? i : 0UL ) );
4852 const size_t kend( ( IsUpper_v<MT5> )
4853 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
4854 :( K ) );
4855
4856 SIMDType xmm1( C.load(i,j ) );
4857 SIMDType xmm2( C.load(i,j+1UL) );
4858 SIMDType xmm3( C.load(i,j+2UL) );
4859 SIMDType xmm4( C.load(i,j+3UL) );
4860
4861 for( size_t k=kbegin; k<kend; ++k ) {
4862 const SIMDType a1( A.load(i,k) );
4863 xmm1 += a1 * set( B(k,j ) );
4864 xmm2 += a1 * set( B(k,j+1UL) );
4865 xmm3 += a1 * set( B(k,j+2UL) );
4866 xmm4 += a1 * set( B(k,j+3UL) );
4867 }
4868
4869 C.store( i, j , xmm1 );
4870 C.store( i, j+1UL, xmm2 );
4871 C.store( i, j+2UL, xmm3 );
4872 C.store( i, j+3UL, xmm4 );
4873 }
4874
4875 for( ; (j+3UL) <= jend; j+=3UL )
4876 {
4877 const size_t kbegin( ( IsLower_v<MT5> )
4878 ?( ( IsUpper_v<MT4> )
4879 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4880 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4881 :( IsUpper_v<MT4> ? i : 0UL ) );
4882 const size_t kend( ( IsUpper_v<MT5> )
4883 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4884 :( K ) );
4885
4886 SIMDType xmm1( C.load(i,j ) );
4887 SIMDType xmm2( C.load(i,j+1UL) );
4888 SIMDType xmm3( C.load(i,j+2UL) );
4889
4890 for( size_t k=kbegin; k<kend; ++k ) {
4891 const SIMDType a1( A.load(i,k) );
4892 xmm1 += a1 * set( B(k,j ) );
4893 xmm2 += a1 * set( B(k,j+1UL) );
4894 xmm3 += a1 * set( B(k,j+2UL) );
4895 }
4896
4897 C.store( i, j , xmm1 );
4898 C.store( i, j+1UL, xmm2 );
4899 C.store( i, j+2UL, xmm3 );
4900 }
4901
4902 for( ; (j+2UL) <= jend; j+=2UL )
4903 {
4904 const size_t kbegin( ( IsLower_v<MT5> )
4905 ?( ( IsUpper_v<MT4> )
4906 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4907 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4908 :( IsUpper_v<MT4> ? i : 0UL ) );
4909 const size_t kend( ( IsUpper_v<MT5> )
4910 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4911 :( K ) );
4912
4913 SIMDType xmm1( C.load(i,j ) );
4914 SIMDType xmm2( C.load(i,j+1UL) );
4915
4916 for( size_t k=kbegin; k<kend; ++k ) {
4917 const SIMDType a1( A.load(i,k) );
4918 xmm1 += a1 * set( B(k,j ) );
4919 xmm2 += a1 * set( B(k,j+1UL) );
4920 }
4921
4922 C.store( i, j , xmm1 );
4923 C.store( i, j+1UL, xmm2 );
4924 }
4925
4926 if( j < jend )
4927 {
4928 const size_t kbegin( ( IsLower_v<MT5> )
4929 ?( ( IsUpper_v<MT4> )
4930 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4931 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4932 :( IsUpper_v<MT4> ? i : 0UL ) );
4933
4934 SIMDType xmm1( C.load(i,j) );
4935
4936 for( size_t k=kbegin; k<K; ++k ) {
4937 xmm1 += A.load(i,k) * set( B(k,j) );
4938 }
4939
4940 C.store( i, j, xmm1 );
4941 }
4942 }
4943
4944 for( ; remainder && i<M; ++i )
4945 {
4946 const size_t jend( LOW ? i+1UL : N );
4947 size_t j( UPP ? i : 0UL );
4948
4949 for( ; (j+2UL) <= jend; j+=2UL )
4950 {
4951 const size_t kbegin( ( IsLower_v<MT5> )
4952 ?( ( IsUpper_v<MT4> )
4953 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4954 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4955 :( IsUpper_v<MT4> ? i : 0UL ) );
4956 const size_t kend( ( IsUpper_v<MT5> )
4957 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4958 :( K ) );
4959
4960 ElementType value1( C(i,j ) );
4961 ElementType value2( C(i,j+1UL) );
4962
4963 for( size_t k=kbegin; k<kend; ++k ) {
4964 value1 += A(i,k) * B(k,j );
4965 value2 += A(i,k) * B(k,j+1UL);
4966 }
4967
4968 C(i,j ) = value1;
4969 C(i,j+1UL) = value2;
4970 }
4971
4972 if( j < jend )
4973 {
4974 const size_t kbegin( ( IsLower_v<MT5> )
4975 ?( ( IsUpper_v<MT4> )
4976 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4977 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4978 :( IsUpper_v<MT4> ? i : 0UL ) );
4979
4980 ElementType value( C(i,j) );
4981
4982 for( size_t k=kbegin; k<K; ++k ) {
4983 value += A(i,k) * B(k,j);
4984 }
4985
4986 C(i,j) = value;
4987 }
4988 }
4989 }
4991 //**********************************************************************************************
4992
4993 //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix addition assignment kernel, non-vectorized overload.
// The DisableIf_t return type ties this overload to the case where
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It contains no computation of its
// own: C, A and B are forwarded unchanged to selectDefaultAddAssignKernel().
5007 template< typename MT3 // Type of the left-hand side target matrix
5008 , typename MT4 // Type of the left-hand side matrix operand
5009 , typename MT5 > // Type of the right-hand side matrix operand
5010 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
5011 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5012 {
// Delegate to the default kernel
5013 selectDefaultAddAssignKernel( C, A, B );
5014 }
5016 //**********************************************************************************************
5017
5018 //**Vectorized default addition assignment to dense matrices (large matrices)*******************
5033 template< typename MT3 // Type of the left-hand side target matrix
5034 , typename MT4 // Type of the left-hand side matrix operand
5035 , typename MT5 > // Type of the right-hand side matrix operand
5036 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
5037 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5038 {
5039 if( LOW )
5040 lmmm( C, A, B, ElementType(1), ElementType(1) );
5041 else if( UPP )
5042 ummm( C, A, B, ElementType(1), ElementType(1) );
5043 else
5044 mmm( C, A, B, ElementType(1), ElementType(1) );
5045 }
5047 //**********************************************************************************************
5048
5049 //**BLAS-based addition assignment to dense matrices (default)**********************************
// BLAS kernel selection for the addition assignment, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseBlasKernel_v<MT3,MT4,MT5> is false; the call is then routed to
// selectLargeAddAssignKernel() with the same three arguments.
5063 template< typename MT3 // Type of the left-hand side target matrix
5064 , typename MT4 // Type of the left-hand side matrix operand
5065 , typename MT5 > // Type of the right-hand side matrix operand
5066 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
5067 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
5068 {
// No BLAS kernel applicable: use the large-matrix kernel instead
5069 selectLargeAddAssignKernel( C, A, B );
5070 }
5072 //**********************************************************************************************
5073
5074 //**BLAS-based addition assignment to dense matrices********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition assignment kernel. Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set, and only enabled when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
//
//  - Neither operand triangular: one gemm() call on C, A and B with both scalar
//    arguments set to ET(1).
//  - A triangular (checked before B): a temporary of type ResultType_t<MT3> is built
//    from serial( B ), passed to trmm() together with A (CblasLeft), and the temporary
//    is then added to C via addAssign().
//  - Only B triangular: same scheme with the temporary built from serial( A ) and B
//    passed to trmm() with CblasRight.
// The lower/upper argument of trmm() follows IsLower_v of the triangular operand.
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
{
   using ET = ElementType_t<MT3>;

   // General case: no triangular operand involved
   if( !IsTriangular_v<MT4> && !IsTriangular_v<MT5> ) {
      gemm( C, A, B, ET(1), ET(1) );
      return;
   }

   if( IsTriangular_v<MT4> ) {
      // Triangular operand on the left-hand side
      ResultType_t<MT3> tmp( serial( B ) );
      trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> ? CblasLower : CblasUpper ), ET(1) );
      addAssign( C, tmp );
   }
   else {
      // Triangular operand on the right-hand side
      ResultType_t<MT3> tmp( serial( A ) );
      trmm( tmp, B, CblasRight, ( IsLower_v<MT5> ? CblasLower : CblasUpper ), ET(1) );
      addAssign( C, tmp );
   }
}
#endif
5113 //**********************************************************************************************
5114
5115 //**Addition assignment to sparse matrices******************************************************
5116 // No special implementation for the addition assignment to sparse matrices.
5117 //**********************************************************************************************
5118
5119 //**Subtraction assignment to dense matrices****************************************************
5132 template< typename MT // Type of the target dense matrix
5133 , bool SO > // Storage order of the target dense matrix
5134 friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5135 {
5137
5138 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
5139 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
5140
5141 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5142 return;
5143 }
5144
5145 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
5146 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
5147
5148 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5149 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5150 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5151 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5152 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
5153 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
5154
5155 TDMatDMatMultExpr::selectSubAssignKernel( *lhs, A, B );
5156 }
5158 //**********************************************************************************************
5159
5160 //**Subtraction assignment to dense matrices (kernel selection)*********************************
5171 template< typename MT3 // Type of the left-hand side target matrix
5172 , typename MT4 // Type of the left-hand side matrix operand
5173 , typename MT5 > // Type of the right-hand side matrix operand
5174 static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5175 {
5176 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
5177 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
5178 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
5179 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
5180 selectSmallSubAssignKernel( C, A, B );
5181 else
5182 selectBlasSubAssignKernel( C, A, B );
5183 }
5185 //**********************************************************************************************
5186
5187 //**Default subtraction assignment to row-major dense matrices (general/general)****************
5201 template< typename MT3 // Type of the left-hand side target matrix
5202 , typename MT4 // Type of the left-hand side matrix operand
5203 , typename MT5 > // Type of the right-hand side matrix operand
5204 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5205 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5206 {
5207 const size_t M( A.rows() );
5208 const size_t N( B.columns() );
5209 const size_t K( A.columns() );
5210
5211 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5212
5213 for( size_t i=0UL; i<M; ++i )
5214 {
5215 const size_t kbegin( ( IsUpper_v<MT4> )
5216 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5217 :( 0UL ) );
5218 const size_t kend( ( IsLower_v<MT4> )
5219 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5220 :( K ) );
5221 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5222
5223 for( size_t k=kbegin; k<kend; ++k )
5224 {
5225 const size_t jbegin( ( IsUpper_v<MT5> )
5226 ?( ( IsStrictlyUpper_v<MT5> )
5227 ?( UPP ? max(i,k+1UL) : k+1UL )
5228 :( UPP ? max(i,k) : k ) )
5229 :( UPP ? i : 0UL ) );
5230 const size_t jend( ( IsLower_v<MT5> )
5231 ?( ( IsStrictlyLower_v<MT5> )
5232 ?( LOW ? min(i+1UL,k) : k )
5233 :( LOW ? min(i,k)+1UL : k+1UL ) )
5234 :( LOW ? i+1UL : N ) );
5235
5236 if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
5237 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5238
5239 const size_t jnum( jend - jbegin );
5240 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
5241 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
5242
5243 for( size_t j=jbegin; j<jpos; j+=2UL ) {
5244 C(i,j ) -= A(i,k) * B(k,j );
5245 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
5246 }
5247 if( jpos < jend ) {
5248 C(i,jpos) -= A(i,k) * B(k,jpos);
5249 }
5250 }
5251 }
5252 }
5254 //**********************************************************************************************
5255
5256 //**Default subtraction assignment to column-major dense matrices (general/general)*************
5270 template< typename MT3 // Type of the left-hand side target matrix
5271 , typename MT4 // Type of the left-hand side matrix operand
5272 , typename MT5 > // Type of the right-hand side matrix operand
5273 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5274 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5275 {
5276 const size_t M( A.rows() );
5277 const size_t N( B.columns() );
5278 const size_t K( A.columns() );
5279
5280 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5281
5282 for( size_t j=0UL; j<N; ++j )
5283 {
5284 const size_t kbegin( ( IsLower_v<MT5> )
5285 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5286 :( 0UL ) );
5287 const size_t kend( ( IsUpper_v<MT5> )
5288 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5289 :( K ) );
5290 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5291
5292 for( size_t k=kbegin; k<kend; ++k )
5293 {
5294 const size_t ibegin( ( IsLower_v<MT4> )
5295 ?( ( IsStrictlyLower_v<MT4> )
5296 ?( LOW ? max(j,k+1UL) : k+1UL )
5297 :( LOW ? max(j,k) : k ) )
5298 :( LOW ? j : 0UL ) );
5299 const size_t iend( ( IsUpper_v<MT4> )
5300 ?( ( IsStrictlyUpper_v<MT4> )
5301 ?( UPP ? min(j+1UL,k) : k )
5302 :( UPP ? min(j,k)+1UL : k+1UL ) )
5303 :( UPP ? j+1UL : M ) );
5304
5305 if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
5306 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5307
5308 const size_t inum( iend - ibegin );
5309 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
5310 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
5311
5312 for( size_t i=ibegin; i<ipos; i+=2UL ) {
5313 C(i ,j) -= A(i ,k) * B(k,j);
5314 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5315 }
5316 if( ipos < iend ) {
5317 C(ipos,j) -= A(ipos,k) * B(k,j);
5318 }
5319 }
5320 }
5321 }
5323 //**********************************************************************************************
5324
5325 //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
5339 template< typename MT3 // Type of the left-hand side target matrix
5340 , typename MT4 // Type of the left-hand side matrix operand
5341 , typename MT5 > // Type of the right-hand side matrix operand
5342 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5343 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5344 {
5345 constexpr size_t block( BLOCK_SIZE );
5346
5347 const size_t M( A.rows() );
5348 const size_t N( B.columns() );
5349
5350 for( size_t ii=0UL; ii<M; ii+=block ) {
5351 const size_t iend( min( M, ii+block ) );
5352 for( size_t jj=0UL; jj<N; jj+=block ) {
5353 const size_t jend( min( N, jj+block ) );
5354 for( size_t i=ii; i<iend; ++i )
5355 {
5356 const size_t jbegin( ( IsUpper_v<MT4> )
5357 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
5358 :( jj ) );
5359 const size_t jpos( ( IsLower_v<MT4> )
5360 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
5361 :( jend ) );
5362
5363 for( size_t j=jbegin; j<jpos; ++j ) {
5364 C(i,j) -= A(i,j) * B(j,j);
5365 }
5366 }
5367 }
5368 }
5369 }
5371 //**********************************************************************************************
5372
5373 //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
5387 template< typename MT3 // Type of the left-hand side target matrix
5388 , typename MT4 // Type of the left-hand side matrix operand
5389 , typename MT5 > // Type of the right-hand side matrix operand
5390 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5391 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5392 {
5393 const size_t M( A.rows() );
5394 const size_t N( B.columns() );
5395
5396 for( size_t j=0UL; j<N; ++j )
5397 {
5398 const size_t ibegin( ( IsLower_v<MT4> )
5399 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5400 :( 0UL ) );
5401 const size_t iend( ( IsUpper_v<MT4> )
5402 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
5403 :( M ) );
5404 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5405
5406 const size_t inum( iend - ibegin );
5407 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
5408 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
5409
5410 for( size_t i=ibegin; i<ipos; i+=2UL ) {
5411 C(i ,j) -= A(i ,j) * B(j,j);
5412 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
5413 }
5414 if( ipos < iend ) {
5415 C(ipos,j) -= A(ipos,j) * B(j,j);
5416 }
5417 }
5418 }
5420 //**********************************************************************************************
5421
5422 //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
5436 template< typename MT3 // Type of the left-hand side target matrix
5437 , typename MT4 // Type of the left-hand side matrix operand
5438 , typename MT5 > // Type of the right-hand side matrix operand
5439 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5440 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5441 {
5442 const size_t M( A.rows() );
5443 const size_t N( B.columns() );
5444
5445 for( size_t i=0UL; i<M; ++i )
5446 {
5447 const size_t jbegin( ( IsUpper_v<MT5> )
5448 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5449 :( 0UL ) );
5450 const size_t jend( ( IsLower_v<MT5> )
5451 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
5452 :( N ) );
5453 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5454
5455 const size_t jnum( jend - jbegin );
5456 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
5457 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
5458
5459 for( size_t j=jbegin; j<jpos; j+=2UL ) {
5460 C(i,j ) -= A(i,i) * B(i,j );
5461 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
5462 }
5463 if( jpos < jend ) {
5464 C(i,jpos) -= A(i,i) * B(i,jpos);
5465 }
5466 }
5467 }
5469 //**********************************************************************************************
5470
5471 //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
5485 template< typename MT3 // Type of the left-hand side target matrix
5486 , typename MT4 // Type of the left-hand side matrix operand
5487 , typename MT5 > // Type of the right-hand side matrix operand
5488 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5489 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5490 {
5491 constexpr size_t block( BLOCK_SIZE );
5492
5493 const size_t M( A.rows() );
5494 const size_t N( B.columns() );
5495
5496 for( size_t jj=0UL; jj<N; jj+=block ) {
5497 const size_t jend( min( N, jj+block ) );
5498 for( size_t ii=0UL; ii<M; ii+=block ) {
5499 const size_t iend( min( M, ii+block ) );
5500 for( size_t j=jj; j<jend; ++j )
5501 {
5502 const size_t ibegin( ( IsLower_v<MT5> )
5503 ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
5504 :( ii ) );
5505 const size_t ipos( ( IsUpper_v<MT5> )
5506 ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
5507 :( iend ) );
5508
5509 for( size_t i=ibegin; i<ipos; ++i ) {
5510 C(i,j) -= A(i,i) * B(i,j);
5511 }
5512 }
5513 }
5514 }
5515 }
5517 //**********************************************************************************************
5518
5519 //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
5533 template< typename MT3 // Type of the left-hand side target matrix
5534 , typename MT4 // Type of the left-hand side matrix operand
5535 , typename MT5 > // Type of the right-hand side matrix operand
5536 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5537 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5538 {
5539 for( size_t i=0UL; i<A.rows(); ++i ) {
5540 C(i,i) -= A(i,i) * B(i,i);
5541 }
5542 }
5544 //**********************************************************************************************
5545
5546 //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix subtraction assignment kernel, non-vectorized overload.
// The DisableIf_t return type ties this overload to the case where
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It contains no computation of its
// own: C, A and B are forwarded unchanged to selectDefaultSubAssignKernel().
5560 template< typename MT3 // Type of the left-hand side target matrix
5561 , typename MT4 // Type of the left-hand side matrix operand
5562 , typename MT5 > // Type of the right-hand side matrix operand
5563 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5564 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5565 {
// Delegate to the default kernel
5566 selectDefaultSubAssignKernel( C, A, B );
5567 }
5569 //**********************************************************************************************
5570
5571 //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Vectorized small-matrix kernel for the subtraction assignment C -= A * B. This overload is
// enabled when the target type MT3 is row-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5>
// holds.
//
// The columns of C are walked in blocks of SIMDSIZE*8 (integral element types only), SIMDSIZE*5,
// SIMDSIZE*4, SIMDSIZE*3, SIMDSIZE*2 and SIMDSIZE elements. For every block a tile of C is loaded
// into SIMD registers, the products A(i,k) * B(k,j..) are subtracted over the k range that is
// relevant for the tile, and the tile is stored back. Columns at or beyond jpos are finished
// element by element in the final scalar loop.
//
// \param C The target left-hand side dense matrix (updated in place).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): LOW, UPP, SIMDSIZE, SIMDType and ElementType are not declared in this function;
// LOW/UPP are presumably the lower/upper flags of the enclosing expression -- confirm against
// the class definition.
5586 template< typename MT3 // Type of the left-hand side target matrix
5587 , typename MT4 // Type of the left-hand side matrix operand
5588 , typename MT5 > // Type of the right-hand side matrix operand
5589 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5590 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5591 {
// A scalar remainder loop is needed unless both C and B are padded.
5592 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5593
// Problem dimensions: C is M x N, the summation index k runs over K.
5594 const size_t M( A.rows() );
5595 const size_t N( B.columns() );
5596 const size_t K( A.columns() );
5597
// Whenever LOW or UPP is set the kernel expects a square result.
5598 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5599
// End of the column range handled with SIMD loads/stores. With a remainder loop this is
// prevMultiple( N, SIMDSIZE ) (presumably N rounded down to a multiple of SIMDSIZE -- helper is
// defined elsewhere), otherwise all N columns are covered.
5600 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
5601 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
5602
// Column cursor shared by all of the following loops; each loop continues where the previous
// one stopped.
5603 size_t j( 0UL );
5604
// Column blocks of 8 SIMD vectors, one row of C per iteration. Only taken for integral element
// types and only when neither LOW nor UPP is set.
5605 if( IsIntegral_v<ElementType> )
5606 {
5607 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5608 for( size_t i=0UL; i<M; ++i )
5609 {
// [kbegin,kend) is the k range visited for this tile. It is narrowed by means of the
// upper/lower (and strictly upper/lower) type traits of the operand types MT4 and MT5; without
// any such trait it is [0,K).
5610 const size_t kbegin( ( IsUpper_v<MT4> )
5611 ?( ( IsLower_v<MT5> )
5612 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5613 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5614 :( IsLower_v<MT5> ? j : 0UL ) );
5615 const size_t kend( ( IsLower_v<MT4> )
5616 ?( ( IsUpper_v<MT5> )
5617 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5618 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5619 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
5620
// Load the current 1 x (8*SIMDSIZE) tile of C.
5621 SIMDType xmm1( C.load(i,j ) );
5622 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5623 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5624 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5625 SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
5626 SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
5627 SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
5628 SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
5629
// Subtract A(i,k) (passed through set()) times the matching row segment of B.
5630 for( size_t k=kbegin; k<kend; ++k ) {
5631 const SIMDType a1( set( A(i,k) ) );
5632 xmm1 -= a1 * B.load(k,j );
5633 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5634 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5635 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5636 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5637 xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
5638 xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
5639 xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
5640 }
5641
// Write the updated tile back to C.
5642 C.store( i, j , xmm1 );
5643 C.store( i, j+SIMDSIZE , xmm2 );
5644 C.store( i, j+SIMDSIZE*2UL, xmm3 );
5645 C.store( i, j+SIMDSIZE*3UL, xmm4 );
5646 C.store( i, j+SIMDSIZE*4UL, xmm5 );
5647 C.store( i, j+SIMDSIZE*5UL, xmm6 );
5648 C.store( i, j+SIMDSIZE*6UL, xmm7 );
5649 C.store( i, j+SIMDSIZE*7UL, xmm8 );
5650 }
5651 }
5652 }
5653
// Column blocks of 5 SIMD vectors (only without LOW/UPP): two rows of C per iteration, followed
// by a single trailing row if M is odd.
5654 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5655 {
5656 size_t i( 0UL );
5657
5658 for( ; (i+2UL) <= M; i+=2UL )
5659 {
// k range for the two rows i and i+1 (upper bound uses i+1/i+2 for a lower MT4).
5660 const size_t kbegin( ( IsUpper_v<MT4> )
5661 ?( ( IsLower_v<MT5> )
5662 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5663 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5664 :( IsLower_v<MT5> ? j : 0UL ) );
5665 const size_t kend( ( IsLower_v<MT4> )
5666 ?( ( IsUpper_v<MT5> )
5667 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5668 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5669 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
5670
// xmm1..xmm5 hold row i, xmm6..xmm10 hold row i+1 of the tile.
5671 SIMDType xmm1 ( C.load(i ,j ) );
5672 SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
5673 SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
5674 SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
5675 SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
5676 SIMDType xmm6 ( C.load(i+1UL,j ) );
5677 SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
5678 SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
5679 SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
5680 SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
5681
// The loaded B vectors b1..b5 are reused for both rows.
5682 for( size_t k=kbegin; k<kend; ++k ) {
5683 const SIMDType a1( set( A(i ,k) ) );
5684 const SIMDType a2( set( A(i+1UL,k) ) );
5685 const SIMDType b1( B.load(k,j ) );
5686 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5687 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5688 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5689 const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5690 xmm1 -= a1 * b1;
5691 xmm2 -= a1 * b2;
5692 xmm3 -= a1 * b3;
5693 xmm4 -= a1 * b4;
5694 xmm5 -= a1 * b5;
5695 xmm6 -= a2 * b1;
5696 xmm7 -= a2 * b2;
5697 xmm8 -= a2 * b3;
5698 xmm9 -= a2 * b4;
5699 xmm10 -= a2 * b5;
5700 }
5701
5702 C.store( i , j , xmm1 );
5703 C.store( i , j+SIMDSIZE , xmm2 );
5704 C.store( i , j+SIMDSIZE*2UL, xmm3 );
5705 C.store( i , j+SIMDSIZE*3UL, xmm4 );
5706 C.store( i , j+SIMDSIZE*4UL, xmm5 );
5707 C.store( i+1UL, j , xmm6 );
5708 C.store( i+1UL, j+SIMDSIZE , xmm7 );
5709 C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
5710 C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
5711 C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
5712 }
5713
// Trailing single row. Here kend is bounded only through IsUpper_v<MT5>; a lower MT4 does not
// tighten it.
5714 if( i < M )
5715 {
5716 const size_t kbegin( ( IsUpper_v<MT4> )
5717 ?( ( IsLower_v<MT5> )
5718 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5719 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5720 :( IsLower_v<MT5> ? j : 0UL ) );
5721 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5722
5723 SIMDType xmm1( C.load(i,j ) );
5724 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5725 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5726 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5727 SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
5728
5729 for( size_t k=kbegin; k<kend; ++k ) {
5730 const SIMDType a1( set( A(i,k) ) );
5731 xmm1 -= a1 * B.load(k,j );
5732 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5733 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5734 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5735 xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5736 }
5737
5738 C.store( i, j , xmm1 );
5739 C.store( i, j+SIMDSIZE , xmm2 );
5740 C.store( i, j+SIMDSIZE*2UL, xmm3 );
5741 C.store( i, j+SIMDSIZE*3UL, xmm4 );
5742 C.store( i, j+SIMDSIZE*4UL, xmm5 );
5743 }
5744 }
5745
// Column blocks of 4 SIMD vectors (only without LOW/UPP): two rows per iteration plus a single
// trailing row.
5746 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5747 {
5748 size_t i( 0UL );
5749
5750 for( ; (i+2UL) <= M; i+=2UL )
5751 {
5752 const size_t kbegin( ( IsUpper_v<MT4> )
5753 ?( ( IsLower_v<MT5> )
5754 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5755 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5756 :( IsLower_v<MT5> ? j : 0UL ) );
5757 const size_t kend( ( IsLower_v<MT4> )
5758 ?( ( IsUpper_v<MT5> )
5759 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5760 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5761 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
5762
// xmm1..xmm4 hold row i, xmm5..xmm8 hold row i+1 of the tile.
5763 SIMDType xmm1( C.load(i ,j ) );
5764 SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
5765 SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
5766 SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
5767 SIMDType xmm5( C.load(i+1UL,j ) );
5768 SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
5769 SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
5770 SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
5771
5772 for( size_t k=kbegin; k<kend; ++k ) {
5773 const SIMDType a1( set( A(i ,k) ) );
5774 const SIMDType a2( set( A(i+1UL,k) ) );
5775 const SIMDType b1( B.load(k,j ) );
5776 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5777 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5778 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5779 xmm1 -= a1 * b1;
5780 xmm2 -= a1 * b2;
5781 xmm3 -= a1 * b3;
5782 xmm4 -= a1 * b4;
5783 xmm5 -= a2 * b1;
5784 xmm6 -= a2 * b2;
5785 xmm7 -= a2 * b3;
5786 xmm8 -= a2 * b4;
5787 }
5788
5789 C.store( i , j , xmm1 );
5790 C.store( i , j+SIMDSIZE , xmm2 );
5791 C.store( i , j+SIMDSIZE*2UL, xmm3 );
5792 C.store( i , j+SIMDSIZE*3UL, xmm4 );
5793 C.store( i+1UL, j , xmm5 );
5794 C.store( i+1UL, j+SIMDSIZE , xmm6 );
5795 C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
5796 C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
5797 }
5798
// Trailing single row (kend bounded only through IsUpper_v<MT5>).
5799 if( i < M )
5800 {
5801 const size_t kbegin( ( IsUpper_v<MT4> )
5802 ?( ( IsLower_v<MT5> )
5803 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5804 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5805 :( IsLower_v<MT5> ? j : 0UL ) );
5806 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5807
5808 SIMDType xmm1( C.load(i,j ) );
5809 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5810 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5811 SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5812
5813 for( size_t k=kbegin; k<kend; ++k ) {
5814 const SIMDType a1( set( A(i,k) ) );
5815 xmm1 -= a1 * B.load(k,j );
5816 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5817 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5818 xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5819 }
5820
5821 C.store( i, j , xmm1 );
5822 C.store( i, j+SIMDSIZE , xmm2 );
5823 C.store( i, j+SIMDSIZE*2UL, xmm3 );
5824 C.store( i, j+SIMDSIZE*3UL, xmm4 );
5825 }
5826 }
5827
// Column blocks of 3 SIMD vectors (only without LOW/UPP): two rows per iteration plus a single
// trailing row.
5828 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5829 {
5830 size_t i( 0UL );
5831
5832 for( ; (i+2UL) <= M; i+=2UL )
5833 {
5834 const size_t kbegin( ( IsUpper_v<MT4> )
5835 ?( ( IsLower_v<MT5> )
5836 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5837 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5838 :( IsLower_v<MT5> ? j : 0UL ) );
5839 const size_t kend( ( IsLower_v<MT4> )
5840 ?( ( IsUpper_v<MT5> )
5841 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5842 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5843 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
5844
// xmm1..xmm3 hold row i, xmm4..xmm6 hold row i+1 of the tile.
5845 SIMDType xmm1( C.load(i ,j ) );
5846 SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
5847 SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
5848 SIMDType xmm4( C.load(i+1UL,j ) );
5849 SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
5850 SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
5851
5852 for( size_t k=kbegin; k<kend; ++k ) {
5853 const SIMDType a1( set( A(i ,k) ) );
5854 const SIMDType a2( set( A(i+1UL,k) ) );
5855 const SIMDType b1( B.load(k,j ) );
5856 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5857 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5858 xmm1 -= a1 * b1;
5859 xmm2 -= a1 * b2;
5860 xmm3 -= a1 * b3;
5861 xmm4 -= a2 * b1;
5862 xmm5 -= a2 * b2;
5863 xmm6 -= a2 * b3;
5864 }
5865
5866 C.store( i , j , xmm1 );
5867 C.store( i , j+SIMDSIZE , xmm2 );
5868 C.store( i , j+SIMDSIZE*2UL, xmm3 );
5869 C.store( i+1UL, j , xmm4 );
5870 C.store( i+1UL, j+SIMDSIZE , xmm5 );
5871 C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
5872 }
5873
// Trailing single row (kend bounded only through IsUpper_v<MT5>).
5874 if( i < M )
5875 {
5876 const size_t kbegin( ( IsUpper_v<MT4> )
5877 ?( ( IsLower_v<MT5> )
5878 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5879 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5880 :( IsLower_v<MT5> ? j : 0UL ) );
5881 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5882
5883 SIMDType xmm1( C.load(i,j ) );
5884 SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5885 SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5886
5887 for( size_t k=kbegin; k<kend; ++k ) {
5888 const SIMDType a1( set( A(i,k) ) );
5889 xmm1 -= a1 * B.load(k,j );
5890 xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5891 xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5892 }
5893
5894 C.store( i, j , xmm1 );
5895 C.store( i, j+SIMDSIZE , xmm2 );
5896 C.store( i, j+SIMDSIZE*2UL, xmm3 );
5897 }
5898 }
5899
// Column blocks of 2 SIMD vectors; skipped only when both LOW and UPP are set. With UPP the row
// range ends at min(j+SIMDSIZE*2,M), with LOW it starts at row j. Rows are processed in groups of
// 4, then 3, then 2, and finally a single trailing row.
5900 for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5901 {
5902 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
5903 size_t i( LOW ? j : 0UL );
5904
// Four rows at a time (upper k bound uses i+3/i+4 for a lower MT4).
5905 for( ; (i+4UL) <= iend; i+=4UL )
5906 {
5907 const size_t kbegin( ( IsUpper_v<MT4> )
5908 ?( ( IsLower_v<MT5> )
5909 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5910 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5911 :( IsLower_v<MT5> ? j : 0UL ) );
5912 const size_t kend( ( IsLower_v<MT4> )
5913 ?( ( IsUpper_v<MT5> )
5914 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
5915 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5916 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5917
5918 SIMDType xmm1( C.load(i ,j ) );
5919 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
5920 SIMDType xmm3( C.load(i+1UL,j ) );
5921 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
5922 SIMDType xmm5( C.load(i+2UL,j ) );
5923 SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
5924 SIMDType xmm7( C.load(i+3UL,j ) );
5925 SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
5926
5927 for( size_t k=kbegin; k<kend; ++k ) {
5928 const SIMDType a1( set( A(i ,k) ) );
5929 const SIMDType a2( set( A(i+1UL,k) ) );
5930 const SIMDType a3( set( A(i+2UL,k) ) );
5931 const SIMDType a4( set( A(i+3UL,k) ) );
5932 const SIMDType b1( B.load(k,j ) );
5933 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5934 xmm1 -= a1 * b1;
5935 xmm2 -= a1 * b2;
5936 xmm3 -= a2 * b1;
5937 xmm4 -= a2 * b2;
5938 xmm5 -= a3 * b1;
5939 xmm6 -= a3 * b2;
5940 xmm7 -= a4 * b1;
5941 xmm8 -= a4 * b2;
5942 }
5943
5944 C.store( i , j , xmm1 );
5945 C.store( i , j+SIMDSIZE, xmm2 );
5946 C.store( i+1UL, j , xmm3 );
5947 C.store( i+1UL, j+SIMDSIZE, xmm4 );
5948 C.store( i+2UL, j , xmm5 );
5949 C.store( i+2UL, j+SIMDSIZE, xmm6 );
5950 C.store( i+3UL, j , xmm7 );
5951 C.store( i+3UL, j+SIMDSIZE, xmm8 );
5952 }
5953
// Three rows at a time.
5954 for( ; (i+3UL) <= iend; i+=3UL )
5955 {
5956 const size_t kbegin( ( IsUpper_v<MT4> )
5957 ?( ( IsLower_v<MT5> )
5958 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5959 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5960 :( IsLower_v<MT5> ? j : 0UL ) );
5961 const size_t kend( ( IsLower_v<MT4> )
5962 ?( ( IsUpper_v<MT5> )
5963 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
5964 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5965 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5966
5967 SIMDType xmm1( C.load(i ,j ) );
5968 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
5969 SIMDType xmm3( C.load(i+1UL,j ) );
5970 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
5971 SIMDType xmm5( C.load(i+2UL,j ) );
5972 SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
5973
5974 for( size_t k=kbegin; k<kend; ++k ) {
5975 const SIMDType a1( set( A(i ,k) ) );
5976 const SIMDType a2( set( A(i+1UL,k) ) );
5977 const SIMDType a3( set( A(i+2UL,k) ) );
5978 const SIMDType b1( B.load(k,j ) );
5979 const SIMDType b2( B.load(k,j+SIMDSIZE) );
5980 xmm1 -= a1 * b1;
5981 xmm2 -= a1 * b2;
5982 xmm3 -= a2 * b1;
5983 xmm4 -= a2 * b2;
5984 xmm5 -= a3 * b1;
5985 xmm6 -= a3 * b2;
5986 }
5987
5988 C.store( i , j , xmm1 );
5989 C.store( i , j+SIMDSIZE, xmm2 );
5990 C.store( i+1UL, j , xmm3 );
5991 C.store( i+1UL, j+SIMDSIZE, xmm4 );
5992 C.store( i+2UL, j , xmm5 );
5993 C.store( i+2UL, j+SIMDSIZE, xmm6 );
5994 }
5995
// Two rows at a time.
5996 for( ; (i+2UL) <= iend; i+=2UL )
5997 {
5998 const size_t kbegin( ( IsUpper_v<MT4> )
5999 ?( ( IsLower_v<MT5> )
6000 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6001 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6002 :( IsLower_v<MT5> ? j : 0UL ) );
6003 const size_t kend( ( IsLower_v<MT4> )
6004 ?( ( IsUpper_v<MT5> )
6005 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6006 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6007 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
6008
6009 SIMDType xmm1( C.load(i ,j ) );
6010 SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
6011 SIMDType xmm3( C.load(i+1UL,j ) );
6012 SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
6013
6014 for( size_t k=kbegin; k<kend; ++k ) {
6015 const SIMDType a1( set( A(i ,k) ) );
6016 const SIMDType a2( set( A(i+1UL,k) ) );
6017 const SIMDType b1( B.load(k,j ) );
6018 const SIMDType b2( B.load(k,j+SIMDSIZE) );
6019 xmm1 -= a1 * b1;
6020 xmm2 -= a1 * b2;
6021 xmm3 -= a2 * b1;
6022 xmm4 -= a2 * b2;
6023 }
6024
6025 C.store( i , j , xmm1 );
6026 C.store( i , j+SIMDSIZE, xmm2 );
6027 C.store( i+1UL, j , xmm3 );
6028 C.store( i+1UL, j+SIMDSIZE, xmm4 );
6029 }
6030
// Trailing single row (kend bounded only through IsUpper_v<MT5>).
6031 if( i < iend )
6032 {
6033 const size_t kbegin( ( IsUpper_v<MT4> )
6034 ?( ( IsLower_v<MT5> )
6035 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6036 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6037 :( IsLower_v<MT5> ? j : 0UL ) );
6038 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
6039
6040 SIMDType xmm1( C.load(i,j ) );
6041 SIMDType xmm2( C.load(i,j+SIMDSIZE) );
6042
6043 for( size_t k=kbegin; k<kend; ++k ) {
6044 const SIMDType a1( set( A(i,k) ) );
6045 xmm1 -= a1 * B.load(k,j );
6046 xmm2 -= a1 * B.load(k,j+SIMDSIZE);
6047 }
6048
6049 C.store( i, j , xmm1 );
6050 C.store( i, j+SIMDSIZE, xmm2 );
6051 }
6052 }
6053
// Remaining single SIMD vectors up to jpos. Rows start at j when LOW is set; the row range is
// capped at min(j+SIMDSIZE,M) only when both LOW and UPP are set. In this loop the upper k bound
// depends on MT4 alone (no IsUpper_v<MT5> term).
6054 for( ; j<jpos; j+=SIMDSIZE )
6055 {
6056 const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
6057 size_t i( LOW ? j : 0UL );
6058
// Four rows at a time.
6059 for( ; (i+4UL) <= iend; i+=4UL )
6060 {
6061 const size_t kbegin( ( IsUpper_v<MT4> )
6062 ?( ( IsLower_v<MT5> )
6063 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6064 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6065 :( IsLower_v<MT5> ? j : 0UL ) );
6066 const size_t kend( ( IsLower_v<MT4> )
6067 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
6068 :( K ) );
6069
6070 SIMDType xmm1( C.load(i ,j) );
6071 SIMDType xmm2( C.load(i+1UL,j) );
6072 SIMDType xmm3( C.load(i+2UL,j) );
6073 SIMDType xmm4( C.load(i+3UL,j) );
6074
6075 for( size_t k=kbegin; k<kend; ++k ) {
6076 const SIMDType b1( B.load(k,j) );
6077 xmm1 -= set( A(i ,k) ) * b1;
6078 xmm2 -= set( A(i+1UL,k) ) * b1;
6079 xmm3 -= set( A(i+2UL,k) ) * b1;
6080 xmm4 -= set( A(i+3UL,k) ) * b1;
6081 }
6082
6083 C.store( i , j, xmm1 );
6084 C.store( i+1UL, j, xmm2 );
6085 C.store( i+2UL, j, xmm3 );
6086 C.store( i+3UL, j, xmm4 );
6087 }
6088
// Three rows at a time.
6089 for( ; (i+3UL) <= iend; i+=3UL )
6090 {
6091 const size_t kbegin( ( IsUpper_v<MT4> )
6092 ?( ( IsLower_v<MT5> )
6093 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6094 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6095 :( IsLower_v<MT5> ? j : 0UL ) );
6096 const size_t kend( ( IsLower_v<MT4> )
6097 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
6098 :( K ) );
6099
6100 SIMDType xmm1( C.load(i ,j) );
6101 SIMDType xmm2( C.load(i+1UL,j) );
6102 SIMDType xmm3( C.load(i+2UL,j) );
6103
6104 for( size_t k=kbegin; k<kend; ++k ) {
6105 const SIMDType b1( B.load(k,j) );
6106 xmm1 -= set( A(i ,k) ) * b1;
6107 xmm2 -= set( A(i+1UL,k) ) * b1;
6108 xmm3 -= set( A(i+2UL,k) ) * b1;
6109 }
6110
6111 C.store( i , j, xmm1 );
6112 C.store( i+1UL, j, xmm2 );
6113 C.store( i+2UL, j, xmm3 );
6114 }
6115
// Two rows at a time.
6116 for( ; (i+2UL) <= iend; i+=2UL )
6117 {
6118 const size_t kbegin( ( IsUpper_v<MT4> )
6119 ?( ( IsLower_v<MT5> )
6120 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6121 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6122 :( IsLower_v<MT5> ? j : 0UL ) );
6123 const size_t kend( ( IsLower_v<MT4> )
6124 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6125 :( K ) );
6126
6127 SIMDType xmm1( C.load(i ,j) );
6128 SIMDType xmm2( C.load(i+1UL,j) );
6129
6130 for( size_t k=kbegin; k<kend; ++k ) {
6131 const SIMDType b1( B.load(k,j) );
6132 xmm1 -= set( A(i ,k) ) * b1;
6133 xmm2 -= set( A(i+1UL,k) ) * b1;
6134 }
6135
6136 C.store( i , j, xmm1 );
6137 C.store( i+1UL, j, xmm2 );
6138 }
6139
// Trailing single row; k runs from kbegin all the way up to K.
6140 if( i < iend )
6141 {
6142 const size_t kbegin( ( IsUpper_v<MT4> )
6143 ?( ( IsLower_v<MT5> )
6144 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6145 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6146 :( IsLower_v<MT5> ? j : 0UL ) );
6147
6148 SIMDType xmm1( C.load(i,j) );
6149
6150 for( size_t k=kbegin; k<K; ++k ) {
6151 xmm1 -= set( A(i,k) ) * B.load(k,j);
6152 }
6153
6154 C.store( i, j, xmm1 );
6155 }
6156 }
6157
// Element-wise treatment of the columns from jpos up to N. The loop body is never entered when
// remainder is false. With UPP the rows end at j+1, with LOW they start at j; rows are handled
// two at a time plus a single trailing row.
6158 for( ; remainder && j<N; ++j )
6159 {
6160 const size_t iend( UPP ? j+1UL : M );
6161 size_t i( LOW ? j : 0UL );
6162
6163 for( ; (i+2UL) <= iend; i+=2UL )
6164 {
6165 const size_t kbegin( ( IsUpper_v<MT4> )
6166 ?( ( IsLower_v<MT5> )
6167 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6168 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6169 :( IsLower_v<MT5> ? j : 0UL ) );
6170 const size_t kend( ( IsLower_v<MT4> )
6171 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6172 :( K ) );
6173
// Accumulate in local scalars and write each element of C once.
6174 ElementType value1( C(i ,j) );
6175 ElementType value2( C(i+1UL,j) );
6176
6177 for( size_t k=kbegin; k<kend; ++k ) {
6178 value1 -= A(i ,k) * B(k,j);
6179 value2 -= A(i+1UL,k) * B(k,j);
6180 }
6181
6182 C(i ,j) = value1;
6183 C(i+1UL,j) = value2;
6184 }
6185
// Trailing single row; k runs from kbegin up to K.
6186 if( i < iend )
6187 {
6188 const size_t kbegin( ( IsUpper_v<MT4> )
6189 ?( ( IsLower_v<MT5> )
6190 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6191 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6192 :( IsLower_v<MT5> ? j : 0UL ) );
6193
6194 ElementType value( C(i,j) );
6195
6196 for( size_t k=kbegin; k<K; ++k ) {
6197 value -= A(i,k) * B(k,j);
6198 }
6199
6200 C(i,j) = value;
6201 }
6202 }
6203 }
6205 //**********************************************************************************************
6206
6207 //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
6222 template< typename MT3 // Type of the left-hand side target matrix
6223 , typename MT4 // Type of the left-hand side matrix operand
6224 , typename MT5 > // Type of the right-hand side matrix operand
6225 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6226 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6227 {
6228 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
6229
6230 const size_t M( A.rows() );
6231 const size_t N( B.columns() );
6232 const size_t K( A.columns() );
6233
6234 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6235
6236 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
6237 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
6238
6239 size_t i( 0UL );
6240
6241 if( IsIntegral_v<ElementType> )
6242 {
6243 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6244 for( size_t j=0UL; j<N; ++j )
6245 {
6246 const size_t kbegin( ( IsLower_v<MT5> )
6247 ?( ( IsUpper_v<MT4> )
6248 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6249 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6250 :( IsUpper_v<MT4> ? i : 0UL ) );
6251 const size_t kend( ( IsUpper_v<MT5> )
6252 ?( ( IsLower_v<MT4> )
6253 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
6254 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
6255 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
6256
6257 SIMDType xmm1( C.load(i ,j) );
6258 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
6259 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
6260 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
6261 SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
6262 SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
6263 SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
6264 SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
6265
6266 for( size_t k=kbegin; k<kend; ++k ) {
6267 const SIMDType b1( set( B(k,j) ) );
6268 xmm1 -= A.load(i ,k) * b1;
6269 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6270 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6271 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
6272 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
6273 xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
6274 xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
6275 xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
6276 }
6277
6278 C.store( i , j, xmm1 );
6279 C.store( i+SIMDSIZE , j, xmm2 );
6280 C.store( i+SIMDSIZE*2UL, j, xmm3 );
6281 C.store( i+SIMDSIZE*3UL, j, xmm4 );
6282 C.store( i+SIMDSIZE*4UL, j, xmm5 );
6283 C.store( i+SIMDSIZE*5UL, j, xmm6 );
6284 C.store( i+SIMDSIZE*6UL, j, xmm7 );
6285 C.store( i+SIMDSIZE*7UL, j, xmm8 );
6286 }
6287 }
6288 }
6289
6290 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6291 {
6292 size_t j( 0UL );
6293
6294 for( ; (j+2UL) <= N; j+=2UL )
6295 {
6296 const size_t kbegin( ( IsLower_v<MT5> )
6297 ?( ( IsUpper_v<MT4> )
6298 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6299 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6300 :( IsUpper_v<MT4> ? i : 0UL ) );
6301 const size_t kend( ( IsUpper_v<MT5> )
6302 ?( ( IsLower_v<MT4> )
6303 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6304 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6305 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
6306
6307 SIMDType xmm1 ( C.load(i ,j ) );
6308 SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
6309 SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
6310 SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
6311 SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
6312 SIMDType xmm6 ( C.load(i ,j+1UL) );
6313 SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
6314 SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
6315 SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
6316 SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
6317
6318 for( size_t k=kbegin; k<kend; ++k ) {
6319 const SIMDType a1( A.load(i ,k) );
6320 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6321 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6322 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6323 const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6324 const SIMDType b1( set( B(k,j ) ) );
6325 const SIMDType b2( set( B(k,j+1UL) ) );
6326 xmm1 -= a1 * b1;
6327 xmm2 -= a2 * b1;
6328 xmm3 -= a3 * b1;
6329 xmm4 -= a4 * b1;
6330 xmm5 -= a5 * b1;
6331 xmm6 -= a1 * b2;
6332 xmm7 -= a2 * b2;
6333 xmm8 -= a3 * b2;
6334 xmm9 -= a4 * b2;
6335 xmm10 -= a5 * b2;
6336 }
6337
6338 C.store( i , j , xmm1 );
6339 C.store( i+SIMDSIZE , j , xmm2 );
6340 C.store( i+SIMDSIZE*2UL, j , xmm3 );
6341 C.store( i+SIMDSIZE*3UL, j , xmm4 );
6342 C.store( i+SIMDSIZE*4UL, j , xmm5 );
6343 C.store( i , j+1UL, xmm6 );
6344 C.store( i+SIMDSIZE , j+1UL, xmm7 );
6345 C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
6346 C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
6347 C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
6348 }
6349
6350 if( j < N )
6351 {
6352 const size_t kbegin( ( IsLower_v<MT5> )
6353 ?( ( IsUpper_v<MT4> )
6354 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6355 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6356 :( IsUpper_v<MT4> ? i : 0UL ) );
6357 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
6358
6359 SIMDType xmm1( C.load(i ,j) );
6360 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
6361 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
6362 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
6363 SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
6364
6365 for( size_t k=kbegin; k<kend; ++k ) {
6366 const SIMDType b1( set( B(k,j) ) );
6367 xmm1 -= A.load(i ,k) * b1;
6368 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6369 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6370 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
6371 xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
6372 }
6373
6374 C.store( i , j, xmm1 );
6375 C.store( i+SIMDSIZE , j, xmm2 );
6376 C.store( i+SIMDSIZE*2UL, j, xmm3 );
6377 C.store( i+SIMDSIZE*3UL, j, xmm4 );
6378 C.store( i+SIMDSIZE*4UL, j, xmm5 );
6379 }
6380 }
6381
6382 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6383 {
6384 size_t j( 0UL );
6385
6386 for( ; (j+2UL) <= N; j+=2UL )
6387 {
6388 const size_t kbegin( ( IsLower_v<MT5> )
6389 ?( ( IsUpper_v<MT4> )
6390 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6391 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6392 :( IsUpper_v<MT4> ? i : 0UL ) );
6393 const size_t kend( ( IsUpper_v<MT5> )
6394 ?( ( IsLower_v<MT4> )
6395 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6396 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6397 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
6398
6399 SIMDType xmm1( C.load(i ,j ) );
6400 SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
6401 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
6402 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
6403 SIMDType xmm5( C.load(i ,j+1UL) );
6404 SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
6405 SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
6406 SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
6407
6408 for( size_t k=kbegin; k<kend; ++k ) {
6409 const SIMDType a1( A.load(i ,k) );
6410 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6411 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6412 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6413 const SIMDType b1( set( B(k,j ) ) );
6414 const SIMDType b2( set( B(k,j+1UL) ) );
6415 xmm1 -= a1 * b1;
6416 xmm2 -= a2 * b1;
6417 xmm3 -= a3 * b1;
6418 xmm4 -= a4 * b1;
6419 xmm5 -= a1 * b2;
6420 xmm6 -= a2 * b2;
6421 xmm7 -= a3 * b2;
6422 xmm8 -= a4 * b2;
6423 }
6424
6425 C.store( i , j , xmm1 );
6426 C.store( i+SIMDSIZE , j , xmm2 );
6427 C.store( i+SIMDSIZE*2UL, j , xmm3 );
6428 C.store( i+SIMDSIZE*3UL, j , xmm4 );
6429 C.store( i , j+1UL, xmm5 );
6430 C.store( i+SIMDSIZE , j+1UL, xmm6 );
6431 C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
6432 C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
6433 }
6434
6435 if( j < N )
6436 {
6437 const size_t kbegin( ( IsLower_v<MT5> )
6438 ?( ( IsUpper_v<MT4> )
6439 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6440 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6441 :( IsUpper_v<MT4> ? i : 0UL ) );
6442 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
6443
6444 SIMDType xmm1( C.load(i ,j) );
6445 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
6446 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
6447 SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
6448
6449 for( size_t k=kbegin; k<kend; ++k ) {
6450 const SIMDType b1( set( B(k,j) ) );
6451 xmm1 -= A.load(i ,k) * b1;
6452 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6453 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6454 xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
6455 }
6456
6457 C.store( i , j, xmm1 );
6458 C.store( i+SIMDSIZE , j, xmm2 );
6459 C.store( i+SIMDSIZE*2UL, j, xmm3 );
6460 C.store( i+SIMDSIZE*3UL, j, xmm4 );
6461 }
6462 }
6463
6464 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
6465 {
6466 size_t j( 0UL );
6467
6468 for( ; (j+2UL) <= N; j+=2UL )
6469 {
6470 const size_t kbegin( ( IsLower_v<MT5> )
6471 ?( ( IsUpper_v<MT4> )
6472 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6473 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6474 :( IsUpper_v<MT4> ? i : 0UL ) );
6475 const size_t kend( ( IsUpper_v<MT5> )
6476 ?( ( IsLower_v<MT4> )
6477 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6478 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6479 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
6480
6481 SIMDType xmm1( C.load(i ,j ) );
6482 SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
6483 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
6484 SIMDType xmm4( C.load(i ,j+1UL) );
6485 SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
6486 SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
6487
6488 for( size_t k=kbegin; k<kend; ++k ) {
6489 const SIMDType a1( A.load(i ,k) );
6490 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6491 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6492 const SIMDType b1( set( B(k,j ) ) );
6493 const SIMDType b2( set( B(k,j+1UL) ) );
6494 xmm1 -= a1 * b1;
6495 xmm2 -= a2 * b1;
6496 xmm3 -= a3 * b1;
6497 xmm4 -= a1 * b2;
6498 xmm5 -= a2 * b2;
6499 xmm6 -= a3 * b2;
6500 }
6501
6502 C.store( i , j , xmm1 );
6503 C.store( i+SIMDSIZE , j , xmm2 );
6504 C.store( i+SIMDSIZE*2UL, j , xmm3 );
6505 C.store( i , j+1UL, xmm4 );
6506 C.store( i+SIMDSIZE , j+1UL, xmm5 );
6507 C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
6508 }
6509
6510 if( j < N )
6511 {
6512 const size_t kbegin( ( IsLower_v<MT5> )
6513 ?( ( IsUpper_v<MT4> )
6514 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6515 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6516 :( IsUpper_v<MT4> ? i : 0UL ) );
6517 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6518
6519 SIMDType xmm1( C.load(i ,j) );
6520 SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
6521 SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
6522
6523 for( size_t k=kbegin; k<kend; ++k ) {
6524 const SIMDType b1( set( B(k,j) ) );
6525 xmm1 -= A.load(i ,k) * b1;
6526 xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6527 xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6528 }
6529
6530 C.store( i , j, xmm1 );
6531 C.store( i+SIMDSIZE , j, xmm2 );
6532 C.store( i+SIMDSIZE*2UL, j, xmm3 );
6533 }
6534 }
6535
6536 for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6537 {
6538 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6539 size_t j( UPP ? i : 0UL );
6540
6541 for( ; (j+4UL) <= jend; j+=4UL )
6542 {
6543 const size_t kbegin( ( IsLower_v<MT5> )
6544 ?( ( IsUpper_v<MT4> )
6545 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6546 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6547 :( IsUpper_v<MT4> ? i : 0UL ) );
6548 const size_t kend( ( IsUpper_v<MT5> )
6549 ?( ( IsLower_v<MT4> )
6550 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6551 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6552 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6553
6554 SIMDType xmm1( C.load(i ,j ) );
6555 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6556 SIMDType xmm3( C.load(i ,j+1UL) );
6557 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6558 SIMDType xmm5( C.load(i ,j+2UL) );
6559 SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
6560 SIMDType xmm7( C.load(i ,j+3UL) );
6561 SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
6562
6563 for( size_t k=kbegin; k<kend; ++k ) {
6564 const SIMDType a1( A.load(i ,k) );
6565 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6566 const SIMDType b1( set( B(k,j ) ) );
6567 const SIMDType b2( set( B(k,j+1UL) ) );
6568 const SIMDType b3( set( B(k,j+2UL) ) );
6569 const SIMDType b4( set( B(k,j+3UL) ) );
6570 xmm1 -= a1 * b1;
6571 xmm2 -= a2 * b1;
6572 xmm3 -= a1 * b2;
6573 xmm4 -= a2 * b2;
6574 xmm5 -= a1 * b3;
6575 xmm6 -= a2 * b3;
6576 xmm7 -= a1 * b4;
6577 xmm8 -= a2 * b4;
6578 }
6579
6580 C.store( i , j , xmm1 );
6581 C.store( i+SIMDSIZE, j , xmm2 );
6582 C.store( i , j+1UL, xmm3 );
6583 C.store( i+SIMDSIZE, j+1UL, xmm4 );
6584 C.store( i , j+2UL, xmm5 );
6585 C.store( i+SIMDSIZE, j+2UL, xmm6 );
6586 C.store( i , j+3UL, xmm7 );
6587 C.store( i+SIMDSIZE, j+3UL, xmm8 );
6588 }
6589
6590 for( ; (j+3UL) <= jend; j+=3UL )
6591 {
6592 const size_t kbegin( ( IsLower_v<MT5> )
6593 ?( ( IsUpper_v<MT4> )
6594 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6595 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6596 :( IsUpper_v<MT4> ? i : 0UL ) );
6597 const size_t kend( ( IsUpper_v<MT5> )
6598 ?( ( IsLower_v<MT4> )
6599 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6600 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6601 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6602
6603 SIMDType xmm1( C.load(i ,j ) );
6604 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6605 SIMDType xmm3( C.load(i ,j+1UL) );
6606 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6607 SIMDType xmm5( C.load(i ,j+2UL) );
6608 SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
6609
6610 for( size_t k=kbegin; k<kend; ++k ) {
6611 const SIMDType a1( A.load(i ,k) );
6612 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6613 const SIMDType b1( set( B(k,j ) ) );
6614 const SIMDType b2( set( B(k,j+1UL) ) );
6615 const SIMDType b3( set( B(k,j+2UL) ) );
6616 xmm1 -= a1 * b1;
6617 xmm2 -= a2 * b1;
6618 xmm3 -= a1 * b2;
6619 xmm4 -= a2 * b2;
6620 xmm5 -= a1 * b3;
6621 xmm6 -= a2 * b3;
6622 }
6623
6624 C.store( i , j , xmm1 );
6625 C.store( i+SIMDSIZE, j , xmm2 );
6626 C.store( i , j+1UL, xmm3 );
6627 C.store( i+SIMDSIZE, j+1UL, xmm4 );
6628 C.store( i , j+2UL, xmm5 );
6629 C.store( i+SIMDSIZE, j+2UL, xmm6 );
6630 }
6631
6632 for( ; (j+2UL) <= jend; j+=2UL )
6633 {
6634 const size_t kbegin( ( IsLower_v<MT5> )
6635 ?( ( IsUpper_v<MT4> )
6636 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6637 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6638 :( IsUpper_v<MT4> ? i : 0UL ) );
6639 const size_t kend( ( IsUpper_v<MT5> )
6640 ?( ( IsLower_v<MT4> )
6641 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6642 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6643 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6644
6645 SIMDType xmm1( C.load(i ,j ) );
6646 SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6647 SIMDType xmm3( C.load(i ,j+1UL) );
6648 SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6649
6650 for( size_t k=kbegin; k<kend; ++k ) {
6651 const SIMDType a1( A.load(i ,k) );
6652 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6653 const SIMDType b1( set( B(k,j ) ) );
6654 const SIMDType b2( set( B(k,j+1UL) ) );
6655 xmm1 -= a1 * b1;
6656 xmm2 -= a2 * b1;
6657 xmm3 -= a1 * b2;
6658 xmm4 -= a2 * b2;
6659 }
6660
6661 C.store( i , j , xmm1 );
6662 C.store( i+SIMDSIZE, j , xmm2 );
6663 C.store( i , j+1UL, xmm3 );
6664 C.store( i+SIMDSIZE, j+1UL, xmm4 );
6665 }
6666
6667 if( j < jend )
6668 {
6669 const size_t kbegin( ( IsLower_v<MT5> )
6670 ?( ( IsUpper_v<MT4> )
6671 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6672 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6673 :( IsUpper_v<MT4> ? i : 0UL ) );
6674 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6675
6676 SIMDType xmm1( C.load(i ,j) );
6677 SIMDType xmm2( C.load(i+SIMDSIZE,j) );
6678
6679 for( size_t k=kbegin; k<kend; ++k ) {
6680 const SIMDType b1( set( B(k,j) ) );
6681 xmm1 -= A.load(i ,k) * b1;
6682 xmm2 -= A.load(i+SIMDSIZE,k) * b1;
6683 }
6684
6685 C.store( i , j, xmm1 );
6686 C.store( i+SIMDSIZE, j, xmm2 );
6687 }
6688 }
6689
6690 for( ; i<ipos; i+=SIMDSIZE )
6691 {
6692 const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
6693 size_t j( UPP ? i : 0UL );
6694
6695 for( ; (j+4UL) <= jend; j+=4UL )
6696 {
6697 const size_t kbegin( ( IsLower_v<MT5> )
6698 ?( ( IsUpper_v<MT4> )
6699 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6700 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6701 :( IsUpper_v<MT4> ? i : 0UL ) );
6702 const size_t kend( ( IsUpper_v<MT5> )
6703 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6704 :( K ) );
6705
6706 SIMDType xmm1( C.load(i,j ) );
6707 SIMDType xmm2( C.load(i,j+1UL) );
6708 SIMDType xmm3( C.load(i,j+2UL) );
6709 SIMDType xmm4( C.load(i,j+3UL) );
6710
6711 for( size_t k=kbegin; k<kend; ++k ) {
6712 const SIMDType a1( A.load(i,k) );
6713 xmm1 -= a1 * set( B(k,j ) );
6714 xmm2 -= a1 * set( B(k,j+1UL) );
6715 xmm3 -= a1 * set( B(k,j+2UL) );
6716 xmm4 -= a1 * set( B(k,j+3UL) );
6717 }
6718
6719 C.store( i, j , xmm1 );
6720 C.store( i, j+1UL, xmm2 );
6721 C.store( i, j+2UL, xmm3 );
6722 C.store( i, j+3UL, xmm4 );
6723 }
6724
6725 for( ; (j+3UL) <= jend; j+=3UL )
6726 {
6727 const size_t kbegin( ( IsLower_v<MT5> )
6728 ?( ( IsUpper_v<MT4> )
6729 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6730 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6731 :( IsUpper_v<MT4> ? i : 0UL ) );
6732 const size_t kend( ( IsUpper_v<MT5> )
6733 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6734 :( K ) );
6735
6736 SIMDType xmm1( C.load(i,j ) );
6737 SIMDType xmm2( C.load(i,j+1UL) );
6738 SIMDType xmm3( C.load(i,j+2UL) );
6739
6740 for( size_t k=kbegin; k<kend; ++k ) {
6741 const SIMDType a1( A.load(i,k) );
6742 xmm1 -= a1 * set( B(k,j ) );
6743 xmm2 -= a1 * set( B(k,j+1UL) );
6744 xmm3 -= a1 * set( B(k,j+2UL) );
6745 }
6746
6747 C.store( i, j , xmm1 );
6748 C.store( i, j+1UL, xmm2 );
6749 C.store( i, j+2UL, xmm3 );
6750 }
6751
6752 for( ; (j+2UL) <= jend; j+=2UL )
6753 {
6754 const size_t kbegin( ( IsLower_v<MT5> )
6755 ?( ( IsUpper_v<MT4> )
6756 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6757 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6758 :( IsUpper_v<MT4> ? i : 0UL ) );
6759 const size_t kend( ( IsUpper_v<MT5> )
6760 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6761 :( K ) );
6762
6763 SIMDType xmm1( C.load(i,j ) );
6764 SIMDType xmm2( C.load(i,j+1UL) );
6765
6766 for( size_t k=kbegin; k<kend; ++k ) {
6767 const SIMDType a1( A.load(i,k) );
6768 xmm1 -= a1 * set( B(k,j ) );
6769 xmm2 -= a1 * set( B(k,j+1UL) );
6770 }
6771
6772 C.store( i, j , xmm1 );
6773 C.store( i, j+1UL, xmm2 );
6774 }
6775
6776 if( j < jend )
6777 {
6778 const size_t kbegin( ( IsLower_v<MT5> )
6779 ?( ( IsUpper_v<MT4> )
6780 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6781 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6782 :( IsUpper_v<MT4> ? i : 0UL ) );
6783
6784 SIMDType xmm1( C.load(i,j) );
6785
6786 for( size_t k=kbegin; k<K; ++k ) {
6787 xmm1 -= A.load(i,k) * set( B(k,j) );
6788 }
6789
6790 C.store( i, j, xmm1 );
6791 }
6792 }
6793
6794 for( ; remainder && i<M; ++i )
6795 {
6796 const size_t jend( LOW ? i+1UL : N );
6797 size_t j( UPP ? i : 0UL );
6798
6799 for( ; (j+2UL) <= jend; j+=2UL )
6800 {
6801 const size_t kbegin( ( IsLower_v<MT5> )
6802 ?( ( IsUpper_v<MT4> )
6803 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6804 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6805 :( IsUpper_v<MT4> ? i : 0UL ) );
6806 const size_t kend( ( IsUpper_v<MT5> )
6807 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6808 :( K ) );
6809
6810 ElementType value1( C(i,j ) );
6811 ElementType value2( C(i,j+1UL) );
6812
6813 for( size_t k=kbegin; k<kend; ++k ) {
6814 value1 -= A(i,k) * B(k,j );
6815 value2 -= A(i,k) * B(k,j+1UL);
6816 }
6817
6818 C(i,j ) = value1;
6819 C(i,j+1UL) = value2;
6820 }
6821
6822 if( j < jend )
6823 {
6824 const size_t kbegin( ( IsLower_v<MT5> )
6825 ?( ( IsUpper_v<MT4> )
6826 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6827 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6828 :( IsUpper_v<MT4> ? i : 0UL ) );
6829
6830 ElementType value( C(i,j) );
6831
6832 for( size_t k=kbegin; k<K; ++k ) {
6833 value -= A(i,k) * B(k,j);
6834 }
6835
6836 C(i,j) = value;
6837 }
6838 }
6839 }
6841 //**********************************************************************************************
6842
6843 //**Default subtraction assignment to dense matrices (large matrices)***************************
6857 template< typename MT3 // Type of the left-hand side target matrix
6858 , typename MT4 // Type of the left-hand side matrix operand
6859 , typename MT5 > // Type of the right-hand side matrix operand
6860 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6861 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6862 {
6863 selectDefaultSubAssignKernel( C, A, B );
6864 }
6866 //**********************************************************************************************
6867
6868 //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
6883 template< typename MT3 // Type of the left-hand side target matrix
6884 , typename MT4 // Type of the left-hand side matrix operand
6885 , typename MT5 > // Type of the right-hand side matrix operand
6886 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6887 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6888 {
6889 if( LOW )
6890 lmmm( C, A, B, ElementType(-1), ElementType(1) );
6891 else if( UPP )
6892 ummm( C, A, B, ElementType(-1), ElementType(1) );
6893 else
6894 mmm( C, A, B, ElementType(-1), ElementType(1) );
6895 }
6897 //**********************************************************************************************
6898
6899 //**BLAS-based subtraction assignment to dense matrices (default)*******************************
6913 template< typename MT3 // Type of the left-hand side target matrix
6914 , typename MT4 // Type of the left-hand side matrix operand
6915 , typename MT5 > // Type of the right-hand side matrix operand
6916 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6917 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6918 {
6919 selectLargeSubAssignKernel( C, A, B );
6920 }
6922 //**********************************************************************************************
6923
6924 //**BLAS-based subtraction assignment to dense matrices*****************************************
6925#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6939 template< typename MT3 // Type of the left-hand side target matrix
6940 , typename MT4 // Type of the left-hand side matrix operand
6941 , typename MT5 > // Type of the right-hand side matrix operand
6942 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6943 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6944 {
6945 using ET = ElementType_t<MT3>;
6946
6947 if( IsTriangular_v<MT4> ) {
6948 ResultType_t<MT3> tmp( serial( B ) );
6949 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
6950 subAssign( C, tmp );
6951 }
6952 else if( IsTriangular_v<MT5> ) {
6953 ResultType_t<MT3> tmp( serial( A ) );
6954 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
6955 subAssign( C, tmp );
6956 }
6957 else {
6958 gemm( C, A, B, ET(-1), ET(1) );
6959 }
6960 }
6962#endif
6963 //**********************************************************************************************
6964
6965 //**Subtraction assignment to sparse matrices***************************************************
6966 // No special implementation for the subtraction assignment to sparse matrices.
6967 //**********************************************************************************************
6968
6969 //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of a TDMatDMatMultExpr to a dense matrix.
//
// The multiplication expression is not combined element-wise on the fly: it is first
// materialized into a temporary of type ResultType (built from serial( rhs )), and that
// temporary is then handed to schurAssign() together with the target.
//
// lhs: target dense matrix.
// rhs: multiplication expression on the right-hand side.
//
// NOTE(review): the listing numbering skips lines inside this body (6986, 6988-6990);
// statements may be missing from this extract - confirm against the original header.
6982 template< typename MT // Type of the target dense matrix
6983 , bool SO > // Storage order of the target dense matrix
6984 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6985 {
6987
6991
// Target and expression have to agree in both dimensions.
6992 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6993 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6994
// Materialize the product, then forward the temporary to the Schur product assignment.
6995 const ResultType tmp( serial( rhs ) );
6996 schurAssign( *lhs, tmp );
6997 }
6999 //**********************************************************************************************
7000
7001 //**Schur product assignment to sparse matrices*************************************************
7002 // No special implementation for the Schur product assignment to sparse matrices.
7003 //**********************************************************************************************
7004
7005 //**Multiplication assignment to dense matrices*************************************************
7006 // No special implementation for the multiplication assignment to dense matrices.
7007 //**********************************************************************************************
7008
7009 //**Multiplication assignment to sparse matrices************************************************
7010 // No special implementation for the multiplication assignment to sparse matrices.
7011 //**********************************************************************************************
7012
7013 //**SMP assignment to dense matrices************************************************************
// SMP assignment of a TDMatDMatMultExpr to a dense matrix.
//
// The overload is enabled through EnableIf_t on IsEvaluationRequired_v<MT,MT1,MT2>. Both
// operands of the expression are bound/evaluated into the local objects A (type LT) and
// B (type RT), and the product A * B is then forwarded to smpAssign() for the target.
//
// lhs: target dense matrix.
// rhs: multiplication expression on the right-hand side.
//
// NOTE(review): the listing numbering skips line 7034 inside this body; a statement may be
// missing from this extract - confirm against the original header.
7029 template< typename MT // Type of the target dense matrix
7030 , bool SO > // Storage order of the target dense matrix
7031 friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
7032 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7033 {
7035
7036 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
7037 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
7038
// An empty target needs no work. If the left operand has no columns, the target is reset.
7039 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
7040 return;
7041 }
7042 else if( rhs.lhs_.columns() == 0UL ) {
7043 reset( *lhs );
7044 return;
7045 }
7046
7047 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
7048 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
7049
// The evaluated operands must keep the dimensions of the originals and match the target.
7050 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
7051 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
7052 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
7053 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
7054 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
7055 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
7056
// Re-issue the assignment with the product of the evaluated operands.
7057 smpAssign( *lhs, A * B );
7058 }
7060 //**********************************************************************************************
7061
7062 //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a TDMatDMatMultExpr to a sparse matrix.
//
// The overload is enabled through EnableIf_t on IsEvaluationRequired_v<MT,MT1,MT2>. The
// expression is materialized into a temporary, which is passed through the ForwardFunctor
// and then forwarded to smpAssign() for the target.
//
// lhs: target sparse matrix.
// rhs: multiplication expression on the right-hand side.
//
// NOTE(review): the listing numbering skips lines inside this body (7083, 7087-7092);
// statements may be missing from this extract - confirm against the original header.
7078 template< typename MT // Type of the target sparse matrix
7079 , bool SO > // Storage order of the target sparse matrix
7080 friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
7081 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7082 {
7084
// Temporary type: ResultType if SO is true, OppositeType otherwise.
// NOTE(review): presumably chosen so the temporary matches the target's storage order - confirm.
7085 using TmpType = If_t< SO, ResultType, OppositeType >;
7086
7093
7094 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
7095 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
7096
7097 const ForwardFunctor fwd;
7098
// Materialize the expression, then assign the functor-wrapped temporary.
7099 const TmpType tmp( rhs );
7100 smpAssign( *lhs, fwd( tmp ) );
7101 }
7103 //**********************************************************************************************
7104
7105 //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a TDMatDMatMultExpr to a dense matrix.
//
// The overload is enabled through EnableIf_t on IsEvaluationRequired_v<MT,MT1,MT2>. Both
// operands of the expression are bound/evaluated into the local objects A (type LT) and
// B (type RT), and the product A * B is then forwarded to smpAddAssign() for the target.
//
// lhs: target dense matrix.
// rhs: multiplication expression on the right-hand side.
//
// NOTE(review): the listing numbering skips line 7126 inside this body; a statement may be
// missing from this extract - confirm against the original header.
7121 template< typename MT // Type of the target dense matrix
7122 , bool SO > // Storage order of the target dense matrix
7123 friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
7124 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7125 {
7127
7128 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
7129 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
7130
// Nothing is done if the target is empty or the left operand has no columns; unlike the
// plain assignment, the target is left untouched in the latter case.
7131 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
7132 return;
7133 }
7134
7135 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
7136 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
7137
// The evaluated operands must keep the dimensions of the originals and match the target.
7138 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
7139 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
7140 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
7141 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
7142 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
7143 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
7144
// Re-issue the addition assignment with the product of the evaluated operands.
7145 smpAddAssign( *lhs, A * B );
7146 }
7148 //**********************************************************************************************
7149
7150 //**SMP addition assignment to sparse matrices**************************************************
7151 // No special implementation for the SMP addition assignment to sparse matrices.
7152 //**********************************************************************************************
7153
7154 //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a TDMatDMatMultExpr to a dense matrix.
//
// The overload is enabled through EnableIf_t on IsEvaluationRequired_v<MT,MT1,MT2>. Both
// operands of the expression are bound/evaluated into the local objects A (type LT) and
// B (type RT), and the product A * B is then forwarded to smpSubAssign() for the target.
//
// lhs: target dense matrix.
// rhs: multiplication expression on the right-hand side.
//
// NOTE(review): the listing numbering skips line 7175 inside this body; a statement may be
// missing from this extract - confirm against the original header.
7170 template< typename MT // Type of the target dense matrix
7171 , bool SO > // Storage order of the target dense matrix
7172 friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
7173 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7174 {
7176
7177 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
7178 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
7179
// Nothing is done if the target is empty or the left operand has no columns; the target is
// left untouched in both cases.
7180 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
7181 return;
7182 }
7183
7184 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
7185 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
7186
// The evaluated operands must keep the dimensions of the originals and match the target.
7187 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
7188 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
7189 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
7190 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
7191 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
7192 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
7193
// Re-issue the subtraction assignment with the product of the evaluated operands.
7194 smpSubAssign( *lhs, A * B );
7195 }
7197 //**********************************************************************************************
7198
7199 //**SMP subtraction assignment to sparse matrices***********************************************
7200 // No special implementation for the SMP subtraction assignment to sparse matrices.
7201 //**********************************************************************************************
7202
7203 //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of a TDMatDMatMultExpr to a dense matrix.
//
// The expression is materialized into a temporary of type ResultType (constructed directly
// from rhs, without serial()), and the temporary is then handed to smpSchurAssign()
// together with the target.
//
// lhs: target dense matrix.
// rhs: multiplication expression on the right-hand side.
//
// NOTE(review): the listing numbering skips lines inside this body (7220, 7222-7224);
// statements may be missing from this extract - confirm against the original header.
7216 template< typename MT // Type of the target dense matrix
7217 , bool SO > // Storage order of the target dense matrix
7218 friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
7219 {
7221
7225
// Target and expression have to agree in both dimensions.
7226 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
7227 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
7228
// Materialize the product, then forward the temporary to the SMP Schur product assignment.
7229 const ResultType tmp( rhs );
7230 smpSchurAssign( *lhs, tmp );
7231 }
7233 //**********************************************************************************************
7234
7235 //**SMP Schur product assignment to sparse matrices*********************************************
7236 // No special implementation for the SMP Schur product assignment to sparse matrices.
7237 //**********************************************************************************************
7238
7239 //**SMP multiplication assignment to dense matrices*********************************************
7240 // No special implementation for the SMP multiplication assignment to dense matrices.
7241 //**********************************************************************************************
7242
7243 //**SMP multiplication assignment to sparse matrices********************************************
7244 // No special implementation for the SMP multiplication assignment to sparse matrices.
7245 //**********************************************************************************************
7246
7247 //**Compile time checks*************************************************************************
7255 //**********************************************************************************************
7256};
7257//*************************************************************************************************
7258
7259
7260
7261
7262//=================================================================================================
7263//
7264// DMATSCALARMULTEXPR SPECIALIZATION
7265//
7266//=================================================================================================
7267
7268//*************************************************************************************************
7276template< typename MT1 // Type of the left-hand side dense matrix
7277 , typename MT2 // Type of the right-hand side dense matrix
7278 , bool SF // Symmetry flag
7279 , bool HF // Hermitian flag
7280 , bool LF // Lower flag
7281 , bool UF // Upper flag
7282 , typename ST > // Type of the right-hand side scalar value
7283class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
7284 : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
7285 , private Computation
7286{
7287 private:
7288 //**Type definitions****************************************************************************
// Type of the wrapped matrix multiplication expression that gets scaled.
7290 using MMM = TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
7291
// Result type of the multiplication expression.
7292 using RES = ResultType_t<MMM>;
// Result types of the left-hand side (RT1) and right-hand side (RT2) matrix operands.
7293 using RT1 = ResultType_t<MT1>;
7294 using RT2 = ResultType_t<MT2>;
// Element types of the two operand result types.
7295 using ET1 = ElementType_t<RT1>;
7296 using ET2 = ElementType_t<RT2>;
// Composite types of the left-hand side (CT1) and right-hand side (CT2) matrix operands.
7297 using CT1 = CompositeType_t<MT1>;
7298 using CT2 = CompositeType_t<MT2>;
7299 //**********************************************************************************************
7300
7301 //**********************************************************************************************
// Compile-time flag: true if the left-hand side operand type MT1 is a computation
// (IsComputation_v) or requires evaluation (RequiresEvaluation_v).
7303 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
7304 //**********************************************************************************************
7305
7306 //**********************************************************************************************
// Compile-time flag: true if the right-hand side operand type MT2 is a computation
// (IsComputation_v) or requires evaluation (RequiresEvaluation_v).
7308 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
7309 //**********************************************************************************************
7310
7311 //**********************************************************************************************
// Structural flags derived from the four template flags SF, HF, LF and UF:
//  - SYM:  SF is set and none of HF, LF, UF is set.
//  - HERM: HF is set and neither LF nor UF is set.
//  - LOW:  LF is set, or UF is set together with SF or HF.
//  - UPP:  UF is set, or LF is set together with SF or HF.
// LOW and UPP can both be true at the same time (e.g. LF and UF both set).
7312 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
7313 static constexpr bool HERM = ( HF && !( LF || UF ) );
7314 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
7315 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
7316 //**********************************************************************************************
7317
7318 //**********************************************************************************************
7320
// Helper variable template used as the enabling condition of evaluation-based overloads.
// The three type parameters do not enter the value: it is true whenever at least one of the
// two operands has to be evaluated (evaluateLeft || evaluateRight).
7323 template< typename T1, typename T2, typename T3 >
7324 static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
7325 //**********************************************************************************************
7326
7327 //**********************************************************************************************
7329
// Helper variable template deciding whether a BLAS kernel may be used for the type
// combination T1 (target), T2 and T3 (operands) and T4 (scalar). It is true only if ALL of
// the following hold:
//  - BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled;
//  - none of the structural flags SYM, HERM, LOW, UPP is set;
//  - T1 is contiguous with mutable data access, T2 and T3 are contiguous with const data access;
//  - neither T2 nor T3 is diagonal;
//  - all three matrix types are SIMD-enabled;
//  - all three element types are BLAS compatible and identical to each other;
//  - it is not the case that T1 has a builtin element type while T4 is complex.
7331 template< typename T1, typename T2, typename T3, typename T4 >
7332 static constexpr bool UseBlasKernel_v =
7333 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
7334 !SYM && !HERM && !LOW && !UPP &&
7335 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
7336 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
7337 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
7338 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
7339 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
7340 IsBLASCompatible_v< ElementType_t<T1> > &&
7341 IsBLASCompatible_v< ElementType_t<T2> > &&
7342 IsBLASCompatible_v< ElementType_t<T3> > &&
7343 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
7344 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
7345 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
7346 //**********************************************************************************************
7347
7348 //**********************************************************************************************
7350
// Helper variable template deciding whether the vectorized default kernel may be used for the
// type combination T1 (target), T2 and T3 (operands) and T4 (scalar). It is true only if ALL
// of the following hold:
//  - useOptimizedKernels is enabled;
//  - T2 and T3 are not both diagonal;
//  - T2 is not diagonal while T1 is column-major, and T3 is not diagonal while T1 is row-major;
//  - all three matrix types are SIMD-enabled;
//  - the three element types and T4 are SIMD combinable;
//  - SIMD addition and SIMD multiplication are available for the element types of T2 and T3.
7352 template< typename T1, typename T2, typename T3, typename T4 >
7353 static constexpr bool UseVectorizedDefaultKernel_v =
7354 ( useOptimizedKernels &&
7355 !( IsDiagonal_v<T2> && IsDiagonal_v<T3> ) &&
7356 !( IsDiagonal_v<T2> && IsColumnMajorMatrix_v<T1> ) &&
7357 !( IsDiagonal_v<T3> && IsRowMajorMatrix_v<T1> ) &&
7358 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
7359 IsSIMDCombinable_v< ElementType_t<T1>
7360 , ElementType_t<T2>
7361 , ElementType_t<T3>
7362 , T4 > &&
7363 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
7364 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
7365 //**********************************************************************************************
7366
7367 //**********************************************************************************************
7369
// Functor type selected from the structural flags, checked in this order:
//  HERM -> DeclHerm, SYM -> DeclSym, LOW && UPP -> DeclDiag, LOW -> DeclLow, UPP -> DeclUpp,
//  none of them -> Noop.
7371 using ForwardFunctor = If_t< HERM
7372 , DeclHerm
7373 , If_t< SYM
7374 , DeclSym
7375 , If_t< LOW
7376 , If_t< UPP
7377 , DeclDiag
7378 , DeclLow >
7379 , If_t< UPP
7380 , DeclUpp
7381 , Noop > > > >;
7382 //**********************************************************************************************
7383
7384 public:
7385 //**Type definitions****************************************************************************
// Type of this DMatScalarMultExpr specialization.
7387 using This = DMatScalarMultExpr<MMM,ST,true>;
7388
// Base class type of this specialization (matches the base in the class head).
7390 using BaseType = MatScalarMultExpr< DenseMatrix<This,true> >;
7391
// Result type of the scaled multiplication expression. The trait is selected from the
// structural flags in the same order as ForwardFunctor (HERM, SYM, LOW && UPP, LOW, UPP) and
// applied to MultTrait_t<RES,ST>; without any flag MultTrait<RES,ST> is used directly. The
// nested ::Type of the selected trait is the result type.
7393 using ResultType = typename If_t< HERM
7394 , DeclHermTrait< MultTrait_t<RES,ST> >
7395 , If_t< SYM
7396 , DeclSymTrait< MultTrait_t<RES,ST> >
7397 , If_t< LOW
7398 , If_t< UPP
7399 , DeclDiagTrait< MultTrait_t<RES,ST> >
7400 , DeclLowTrait< MultTrait_t<RES,ST> > >
7401 , If_t< UPP
7402 , DeclUppTrait< MultTrait_t<RES,ST> >
7403 , MultTrait<RES,ST> > > > >::Type;
7404
// Types derived from ResultType: opposite type, transpose type, element type and the SIMD
// type belonging to the element type.
7405 using OppositeType = OppositeType_t<ResultType>;
7406 using TransposeType = TransposeType_t<ResultType>;
7407 using ElementType = ElementType_t<ResultType>;
7408 using SIMDType = SIMDTrait_t<ElementType>;
// Element access returns a const value; the composite type is a const ResultType.
7409 using ReturnType = const ElementType;
7410 using CompositeType = const ResultType;
7411
// Left operand of the scaling: the (const) matrix multiplication expression.
7413 using LeftOperand = const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
7414
// Right operand of the scaling: the scalar type.
7416 using RightOperand = ST;
7417
// Type used for the left matrix operand in the assign functions: const RT1 if the operand
// has to be evaluated (evaluateLeft), otherwise its composite type CT1.
7419 using LT = If_t< evaluateLeft, const RT1, CT1 >;
7420
// Type used for the right matrix operand in the assign functions: const RT2 if the operand
// has to be evaluated (evaluateRight), otherwise its composite type CT2.
7422 using RT = If_t< evaluateRight, const RT2, CT2 >;
7423 //**********************************************************************************************
7424
7425 //**Compilation flags***************************************************************************
// Compilation flag for SIMD: true if the operands are not both diagonal, both operand types
// are SIMD-enabled, ET1/ET2/ST are SIMD combinable, and SIMD addition and multiplication are
// available for ET1 and ET2.
7427 static constexpr bool simdEnabled =
7428 ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
7429 MT1::simdEnabled && MT2::simdEnabled &&
7430 IsSIMDCombinable_v<ET1,ET2,ST> &&
7431 HasSIMDAdd_v<ET1,ET2> &&
7432 HasSIMDMult_v<ET1,ET2> );
7433
// Compilation flag for SMP assignments: true only if neither operand has to be evaluated and
// both operand types are themselves SMP-assignable.
7435 static constexpr bool smpAssignable =
7436 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
7437 //**********************************************************************************************
7438
7439 //**SIMD properties*****************************************************************************
// Size value taken from SIMDTrait<ElementType>::size.
// NOTE(review): presumably the number of elements per SIMD vector - confirm against SIMDTrait.
7441 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
7442 //**********************************************************************************************
7443
7444 //**Constructor*********************************************************************************
// Constructor: stores the given multiplication expression and the scalar in the members
// matrix_ and scalar_ (their declarations are not part of this extract).
//
// matrix: the matrix multiplication expression to be scaled.
// scalar: the scalar factor.
7450 inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
7451 : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
7452 , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
7453 {}
7454 //**********************************************************************************************
7455
7456 //**Access operator*****************************************************************************
// 2D element access: returns element (i,j) of the wrapped multiplication expression
// multiplied by the scalar. The indices are checked only through BLAZE_INTERNAL_ASSERT;
// use at() for an access that throws on invalid indices.
//
// i: row index, expected to be smaller than matrix_.rows().
// j: column index, expected to be smaller than matrix_.columns().
7463 inline ReturnType operator()( size_t i, size_t j ) const {
7464 BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
7465 BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
7466 return matrix_(i,j) * scalar_;
7467 }
7468 //**********************************************************************************************
7469
7470 //**At function*********************************************************************************
7478 inline ReturnType at( size_t i, size_t j ) const {
7479 if( i >= matrix_.rows() ) {
7480 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
7481 }
7482 if( j >= matrix_.columns() ) {
7483 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
7484 }
7485 return (*this)(i,j);
7486 }
7487 //**********************************************************************************************
7488
7489 //**Rows function*******************************************************************************
// Returns the number of rows, taken from the wrapped multiplication expression.
7494 inline size_t rows() const {
7495 return matrix_.rows();
7496 }
7497 //**********************************************************************************************
7498
7499 //**Columns function****************************************************************************
// Returns the number of columns, taken from the wrapped multiplication expression.
7504 inline size_t columns() const {
7505 return matrix_.columns();
7506 }
7507 //**********************************************************************************************
7508
7509 //**Left operand access*************************************************************************
// Returns the left operand of the scaling, i.e. the stored multiplication expression.
7514 inline LeftOperand leftOperand() const {
7515 return matrix_;
7516 }
7517 //**********************************************************************************************
7518
7519 //**Right operand access************************************************************************
// Returns the right operand of the scaling, i.e. the stored scalar.
7524 inline RightOperand rightOperand() const {
7525 return scalar_;
7526 }
7527 //**********************************************************************************************
7528
7529 //**********************************************************************************************
// Alias query: forwards the given address to canAlias() of the wrapped multiplication
// expression and returns its answer.
//
// alias: address to be checked.
7535 template< typename T >
7536 inline bool canAlias( const T* alias ) const {
7537 return matrix_.canAlias( alias );
7538 }
7539 //**********************************************************************************************
7540
7541 //**********************************************************************************************
// Alias query: forwards the given address to isAliased() of the wrapped multiplication
// expression and returns its answer.
//
// alias: address to be checked.
7547 template< typename T >
7548 inline bool isAliased( const T* alias ) const {
7549 return matrix_.isAliased( alias );
7550 }
7551 //**********************************************************************************************
7552
7553 //**********************************************************************************************
// Alignment query: returns the answer of isAligned() of the wrapped multiplication expression.
7558 inline bool isAligned() const {
7559 return matrix_.isAligned();
7560 }
7561 //**********************************************************************************************
7562
7563 //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
//
// As visible here, the result is true if rows()*columns() reaches SMP_TDMATDMATMULT_THRESHOLD
// and, in addition, either BLAS mode or the BLAS matrix/matrix multiplication is disabled or
// rows()*columns() stays below TDMATDMATMULT_THRESHOLD.
//
// NOTE(review): the listing numbering skips line 7571 in the middle of the first
// parenthesized condition; an additional operand of the '||' chain may be missing from this
// extract - confirm against the original header before relying on the description above.
7568 inline bool canSMPAssign() const noexcept {
7569 return ( !BLAZE_BLAS_MODE ||
7570 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7572 ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7573 ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD );
7574 }
7575 //**********************************************************************************************
7576
7577 private:
7578 //**Member variables****************************************************************************
7581 //**********************************************************************************************
7582
7583 //**Assignment to dense matrices****************************************************************
/*!\brief Assignment of a scaled matrix product to a dense matrix (\a lhs = scalar * A * B).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// Extracts the two operands of the wrapped product, handles the degenerate sizes, evaluates
// both operands serially and hands them, together with the scalar, to selectAssignKernel().
// The types MMM, LT and RT are declared outside this excerpt.
//
// NOTE(review): the embedded listing numbers jump from 7598 to 7600, so a statement at the
// top of the body may be missing from this copy.
*/
7595 template< typename MT // Type of the target dense matrix
7596 , bool SO > // Storage order of the target dense matrix
7597 friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7598 {
7600
7601 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
7602 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
7603
     // Operands of the wrapped matrix product (rhs.matrix_), not of the scaled expression itself.
7604 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7605 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7606
     // Empty target: nothing to write. Zero inner dimension: no product terms exist, so the
     // target is only reset.
7607 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
7608 return;
7609 }
7610 else if( left.columns() == 0UL ) {
7611 reset( *lhs );
7612 return;
7613 }
7614
7615 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7616 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7617
7618 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7619 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7620 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7621 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7622 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
7623 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
7624
7625 DMatScalarMultExpr::selectAssignKernel( *lhs, A, B, rhs.scalar_ );
7626 }
7627 //**********************************************************************************************
7628
7629 //**Assignment to dense matrices (kernel selection)*********************************************
/*!\brief Selection of the kernel for the assignment \a C = scalar * A * B.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Dispatches to selectSmallAssignKernel() when any of the following holds, otherwise to
// selectBlasAssignKernel():
//  - both operands are diagonal,
//  - outside of debug mode, a row-major target with B.columns() <= SIMDSIZE*10,
//  - outside of debug mode, a column-major target with A.rows() <= SIMDSIZE*10,
//  - the target has fewer than TDMATDMATMULT_THRESHOLD elements.
*/
7640 template< typename MT3 // Type of the left-hand side target matrix
7641 , typename MT4 // Type of the left-hand side matrix operand
7642 , typename MT5 // Type of the right-hand side matrix operand
7643 , typename ST2 > // Type of the scalar value
7644 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7645 {
7646 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
7647 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
7648 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
7649 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7650 selectSmallAssignKernel( C, A, B, scalar );
7651 else
7652 selectBlasAssignKernel( C, A, B, scalar );
7653 }
7654 //**********************************************************************************************
7655
7656 //**Default assignment to row-major dense matrices (general/general)****************************
/*!\brief Default (scalar loop) assignment \a C = scalar * A * B for a row-major target and
//        two non-diagonal operands.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The target is processed row by row. For each row i the k-range is narrowed according to the
// triangular structure of A; the row is then (1) initialised with the product term for
// k = kbegin, (2) updated with the remaining terms, and (3) multiplied by \a scalar. Column
// ranges are narrowed by the triangular structure of B and by the result flags SYM, HERM, LOW
// and UPP, which are declared outside this excerpt (presumably derived from the class's
// SF/HF/LF/UF template flags -- TODO confirm). For SYM/HERM results only columns j >= i are
// computed in the row loop; the remaining part is filled by the final mirroring loop.
*/
7670 template< typename MT3 // Type of the left-hand side target matrix
7671 , typename MT4 // Type of the left-hand side matrix operand
7672 , typename MT5 // Type of the right-hand side matrix operand
7673 , typename ST2 > // Type of the scalar value
7674 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7675 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7676 {
7677 const size_t M( A.rows() );
7678 const size_t N( B.columns() );
7679 const size_t K( A.columns() );
7680
7681 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7682
7683 for( size_t i=0UL; i<M; ++i )
7684 {
     // Range of k that is visited for row i, restricted by A's upper/lower structure.
7685 const size_t kbegin( ( IsUpper_v<MT4> )
7686 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7687 :( 0UL ) );
7688 const size_t kend( ( IsLower_v<MT4> )
7689 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
7690 :( K ) );
7691 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7692
     // Empty k-range for a strictly triangular A: the whole row of C is reset.
7693 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
7694 for( size_t j=0UL; j<N; ++j ) {
7695 reset( C(i,j) );
7696 }
7697 continue;
7698 }
7699
     // Step 1: initialise row i with the term for k == kbegin (plain assignment) and reset
     // elements outside [jbegin,jend) where the structure/flags call for it.
7700 {
7701 const size_t jbegin( ( IsUpper_v<MT5> )
7702 ?( ( IsStrictlyUpper_v<MT5> )
7703 ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
7704 :( UPP ? max(i,kbegin) : kbegin ) )
7705 :( UPP ? i : 0UL ) );
7706 const size_t jend( ( IsLower_v<MT5> )
7707 ?( ( IsStrictlyLower_v<MT5> )
7708 ?( LOW ? min(i+1UL,kbegin) : kbegin )
7709 :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
7710 :( LOW ? i+1UL : N ) );
7711
7712 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7713 for( size_t j=0UL; j<jbegin; ++j ) {
7714 reset( C(i,j) );
7715 }
7716 }
7717 else if( IsStrictlyUpper_v<MT5> ) {
7718 reset( C(i,0UL) );
7719 }
7720 for( size_t j=jbegin; j<jend; ++j ) {
7721 C(i,j) = A(i,kbegin) * B(kbegin,j);
7722 }
7723 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7724 for( size_t j=jend; j<N; ++j ) {
7725 reset( C(i,j) );
7726 }
7727 }
7728 else if( IsStrictlyLower_v<MT5> ) {
7729 reset( C(i,N-1UL) );
7730 }
7731 }
7732
     // Step 2: add the terms for the remaining k values.
7733 for( size_t k=kbegin+1UL; k<kend; ++k )
7734 {
7735 const size_t jbegin( ( IsUpper_v<MT5> )
7736 ?( ( IsStrictlyUpper_v<MT5> )
7737 ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
7738 :( SYM || HERM || UPP ? max( i, k ) : k ) )
7739 :( SYM || HERM || UPP ? i : 0UL ) );
7740 const size_t jend( ( IsLower_v<MT5> )
7741 ?( ( IsStrictlyLower_v<MT5> )
7742 ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
7743 :( LOW ? min(i+1UL,k) : k ) )
7744 :( LOW ? i+1UL : N ) );
7745
7746 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7747 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7748
7749 for( size_t j=jbegin; j<jend; ++j ) {
7750 C(i,j) += A(i,k) * B(k,j);
7751 }
     // For a lower B the element at column jend is assigned rather than accumulated
     // (presumably its first contribution for this row -- TODO confirm).
7752 if( IsLower_v<MT5> ) {
7753 C(i,jend) = A(i,k) * B(k,jend);
7754 }
7755 }
7756
     // Step 3: apply the scaling factor to the computed part of row i.
7757 {
7758 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7759 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
7760 :( SYM || HERM || UPP ? i : 0UL ) );
7761 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
7762 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
7763 :( LOW ? i+1UL : N ) );
7764
7765 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7766 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7767
7768 for( size_t j=jbegin; j<jend; ++j ) {
7769 C(i,j) *= scalar;
7770 }
7771 }
7772 }
7773
     // Symmetric/Hermitian result: fill C(i,j) for j < i from C(j,i), conjugated for HERM.
7774 if( SYM || HERM ) {
7775 for( size_t i=1UL; i<M; ++i ) {
7776 for( size_t j=0UL; j<i; ++j ) {
7777 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
7778 }
7779 }
7780 }
7781 }
7782 //**********************************************************************************************
7783
7784 //**Default assignment to column-major dense matrices (general/general)*************************
/*!\brief Default (scalar loop) assignment \a C = scalar * A * B for a column-major target and
//        two non-diagonal operands.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The target is processed column by column. For each column j the k-range is narrowed
// according to the triangular structure of B; the column is then (1) initialised with the
// product term for k = kbegin, (2) updated with the remaining terms, and (3) multiplied by
// \a scalar. Row ranges are narrowed by the triangular structure of A and by the result flags
// SYM, HERM, LOW and UPP, which are declared outside this excerpt (presumably derived from the
// class's SF/HF/LF/UF template flags -- TODO confirm). For SYM/HERM results only rows i >= j
// are computed in the column loop; the remaining part is filled by the final mirroring loop.
*/
7798 template< typename MT3 // Type of the left-hand side target matrix
7799 , typename MT4 // Type of the left-hand side matrix operand
7800 , typename MT5 // Type of the right-hand side matrix operand
7801 , typename ST2 > // Type of the scalar value
7802 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7803 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7804 {
7805 const size_t M( A.rows() );
7806 const size_t N( B.columns() );
7807 const size_t K( A.columns() );
7808
7809 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7810
7811 for( size_t j=0UL; j<N; ++j )
7812 {
     // Range of k that is visited for column j, restricted by B's lower/upper structure.
7813 const size_t kbegin( ( IsLower_v<MT5> )
7814 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7815 :( 0UL ) );
7816 const size_t kend( ( IsUpper_v<MT5> )
7817 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7818 :( K ) );
7819 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7820
     // Empty k-range for a strictly triangular B: the whole column of C is reset.
7821 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
7822 for( size_t i=0UL; i<M; ++i ) {
7823 reset( C(i,j) );
7824 }
7825 continue;
7826 }
7827
     // Step 1: initialise column j with the term for k == kbegin (plain assignment) and reset
     // elements outside [ibegin,iend) where the structure/flags call for it.
7828 {
7829 const size_t ibegin( ( IsLower_v<MT4> )
7830 ?( ( IsStrictlyLower_v<MT4> )
7831 ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
7832 :( LOW ? max(j,kbegin) : kbegin ) )
7833 :( LOW ? j : 0UL ) );
7834 const size_t iend( ( IsUpper_v<MT4> )
7835 ?( ( IsStrictlyUpper_v<MT4> )
7836 ?( UPP ? min(j+1UL,kbegin) : kbegin )
7837 :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
7838 :( UPP ? j+1UL : M ) );
7839
7840 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7841 for( size_t i=0UL; i<ibegin; ++i ) {
7842 reset( C(i,j) );
7843 }
7844 }
7845 else if( IsStrictlyLower_v<MT4> ) {
7846 reset( C(0UL,j) );
7847 }
7848 for( size_t i=ibegin; i<iend; ++i ) {
7849 C(i,j) = A(i,kbegin) * B(kbegin,j);
7850 }
7851 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7852 for( size_t i=iend; i<M; ++i ) {
7853 reset( C(i,j) );
7854 }
7855 }
7856 else if( IsStrictlyUpper_v<MT4> ) {
7857 reset( C(M-1UL,j) );
7858 }
7859 }
7860
     // Step 2: add the terms for the remaining k values.
7861 for( size_t k=kbegin+1UL; k<kend; ++k )
7862 {
7863 const size_t ibegin( ( IsLower_v<MT4> )
7864 ?( ( IsStrictlyLower_v<MT4> )
7865 ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
7866 :( SYM || HERM || LOW ? max( j, k ) : k ) )
7867 :( SYM || HERM || LOW ? j : 0UL ) );
7868 const size_t iend( ( IsUpper_v<MT4> )
7869 ?( ( IsStrictlyUpper_v<MT4> )
7870 ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
7871 :( UPP ? min(j+1UL,k) : k ) )
7872 :( UPP ? j+1UL : M ) );
7873
7874 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7875 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7876
7877 for( size_t i=ibegin; i<iend; ++i ) {
7878 C(i,j) += A(i,k) * B(k,j);
7879 }
     // For an upper A the element at row iend is assigned rather than accumulated
     // (presumably its first contribution for this column -- TODO confirm).
7880 if( IsUpper_v<MT4> ) {
7881 C(iend,j) = A(iend,k) * B(k,j);
7882 }
7883 }
7884
     // Step 3: apply the scaling factor to the computed part of column j.
7885 {
7886 const size_t ibegin( ( ( IsLower_v<MT4> && IsLower_v<MT5> ) )
7887 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
7888 :( SYM || HERM || LOW ? j : 0UL ) );
7889 const size_t iend( ( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) )
7890 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
7891 :( UPP ? j+1UL : M ) );
7892
7893 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7894 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7895
7896 for( size_t i=ibegin; i<iend; ++i ) {
7897 C(i,j) *= scalar;
7898 }
7899 }
7900 }
7901
     // Symmetric/Hermitian result: fill C(i,j) for i < j from C(j,i), conjugated for HERM.
7902 if( SYM || HERM ) {
7903 for( size_t j=1UL; j<N; ++j ) {
7904 for( size_t i=0UL; i<j; ++i ) {
7905 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
7906 }
7907 }
7908 }
7909 }
7910 //**********************************************************************************************
7911
7912 //**Default assignment to row-major dense matrices (general/diagonal)***************************
/*!\brief Default assignment \a C = scalar * A * B for a row-major target, a non-diagonal
//        left operand and a diagonal right operand.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// With a diagonal B every result element is a single product,
// C(i,j) = A(i,j) * B(j,j) * scalar. The target is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE
// elements; within a tile row, columns outside the range given by A's triangular structure are
// reset instead of computed.
*/
7926 template< typename MT3 // Type of the left-hand side target matrix
7927 , typename MT4 // Type of the left-hand side matrix operand
7928 , typename MT5 // Type of the right-hand side matrix operand
7929 , typename ST2 > // Type of the scalar value
7930 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7931 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7932 {
7933 constexpr size_t block( BLOCK_SIZE );
7934
7935 const size_t M( A.rows() );
7936 const size_t N( B.columns() );
7937
7938 for( size_t ii=0UL; ii<M; ii+=block ) {
7939 const size_t iend( min( M, ii+block ) );
7940 for( size_t jj=0UL; jj<N; jj+=block ) {
7941 const size_t jend( min( N, jj+block ) );
7942 for( size_t i=ii; i<iend; ++i )
7943 {
     // [jbegin,jpos) is the part of this tile row that is computed.
7944 const size_t jbegin( ( IsUpper_v<MT4> )
7945 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
7946 :( jj ) );
7947 const size_t jpos( ( IsLower_v<MT4> )
7948 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
7949 :( jend ) );
7950
7951 if( IsUpper_v<MT4> ) {
7952 for( size_t j=jj; j<jbegin; ++j ) {
7953 reset( C(i,j) );
7954 }
7955 }
7956 for( size_t j=jbegin; j<jpos; ++j ) {
7957 C(i,j) = A(i,j) * B(j,j) * scalar;
7958 }
7959 if( IsLower_v<MT4> ) {
7960 for( size_t j=jpos; j<jend; ++j ) {
7961 reset( C(i,j) );
7962 }
7963 }
7964 }
7965 }
7966 }
7967 }
7968 //**********************************************************************************************
7969
7970 //**Default assignment to column-major dense matrices (general/diagonal)************************
/*!\brief Default assignment \a C = scalar * A * B for a column-major target, a non-diagonal
//        left operand and a diagonal right operand.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// With a diagonal B every result element is a single product,
// C(i,j) = A(i,j) * B(j,j) * scalar. The target is traversed column by column; rows outside
// the range [ibegin,iend) given by A's triangular structure are reset instead of computed.
*/
7984 template< typename MT3 // Type of the left-hand side target matrix
7985 , typename MT4 // Type of the left-hand side matrix operand
7986 , typename MT5 // Type of the right-hand side matrix operand
7987 , typename ST2 > // Type of the scalar value
7988 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7989 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7990 {
7991 const size_t M( A.rows() );
7992 const size_t N( B.columns() );
7993
7994 for( size_t j=0UL; j<N; ++j )
7995 {
     // Row range of column j that is computed, restricted by A's lower/upper structure.
7996 const size_t ibegin( ( IsLower_v<MT4> )
7997 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7998 :( 0UL ) );
7999 const size_t iend( ( IsUpper_v<MT4> )
8000 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
8001 :( M ) );
8002 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
8003
8004 if( IsLower_v<MT4> ) {
8005 for( size_t i=0UL; i<ibegin; ++i ) {
8006 reset( C(i,j) );
8007 }
8008 }
8009 for( size_t i=ibegin; i<iend; ++i ) {
8010 C(i,j) = A(i,j) * B(j,j) * scalar;
8011 }
8012 if( IsUpper_v<MT4> ) {
8013 for( size_t i=iend; i<M; ++i ) {
8014 reset( C(i,j) );
8015 }
8016 }
8017 }
8018 }
8019 //**********************************************************************************************
8020
8021 //**Default assignment to row-major dense matrices (diagonal/general)***************************
/*!\brief Default assignment \a C = scalar * A * B for a row-major target, a diagonal left
//        operand and a non-diagonal right operand.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// With a diagonal A every result element is a single product,
// C(i,j) = A(i,i) * B(i,j) * scalar. The target is traversed row by row; columns outside the
// range [jbegin,jend) given by B's triangular structure are reset instead of computed.
*/
8035 template< typename MT3 // Type of the left-hand side target matrix
8036 , typename MT4 // Type of the left-hand side matrix operand
8037 , typename MT5 // Type of the right-hand side matrix operand
8038 , typename ST2 > // Type of the scalar value
8039 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8040 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8041 {
8042 const size_t M( A.rows() );
8043 const size_t N( B.columns() );
8044
8045 for( size_t i=0UL; i<M; ++i )
8046 {
     // Column range of row i that is computed, restricted by B's upper/lower structure.
8047 const size_t jbegin( ( IsUpper_v<MT5> )
8048 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
8049 :( 0UL ) );
8050 const size_t jend( ( IsLower_v<MT5> )
8051 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
8052 :( N ) );
8053 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
8054
8055 if( IsUpper_v<MT5> ) {
8056 for( size_t j=0UL; j<jbegin; ++j ) {
8057 reset( C(i,j) );
8058 }
8059 }
8060 for( size_t j=jbegin; j<jend; ++j ) {
8061 C(i,j) = A(i,i) * B(i,j) * scalar;
8062 }
8063 if( IsLower_v<MT5> ) {
8064 for( size_t j=jend; j<N; ++j ) {
8065 reset( C(i,j) );
8066 }
8067 }
8068 }
8069 }
8070 //**********************************************************************************************
8071
8072 //**Default assignment to column-major dense matrices (diagonal/general)************************
/*!\brief Default assignment \a C = scalar * A * B for a column-major target, a diagonal left
//        operand and a non-diagonal right operand.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// With a diagonal A every result element is a single product,
// C(i,j) = A(i,i) * B(i,j) * scalar. The target is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE
// elements; within a tile column, rows outside the range given by B's triangular structure are
// reset instead of computed.
*/
8086 template< typename MT3 // Type of the left-hand side target matrix
8087 , typename MT4 // Type of the left-hand side matrix operand
8088 , typename MT5 // Type of the right-hand side matrix operand
8089 , typename ST2 > // Type of the scalar value
8090 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8091 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8092 {
8093 constexpr size_t block( BLOCK_SIZE );
8094
8095 const size_t M( A.rows() );
8096 const size_t N( B.columns() );
8097
8098 for( size_t jj=0UL; jj<N; jj+=block ) {
8099 const size_t jend( min( N, jj+block ) );
8100 for( size_t ii=0UL; ii<M; ii+=block ) {
8101 const size_t iend( min( M, ii+block ) );
8102 for( size_t j=jj; j<jend; ++j )
8103 {
     // [ibegin,ipos) is the part of this tile column that is computed.
8104 const size_t ibegin( ( IsLower_v<MT5> )
8105 ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
8106 :( ii ) );
8107 const size_t ipos( ( IsUpper_v<MT5> )
8108 ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
8109 :( iend ) );
8110
8111 if( IsLower_v<MT5> ) {
8112 for( size_t i=ii; i<ibegin; ++i ) {
8113 reset( C(i,j) );
8114 }
8115 }
8116 for( size_t i=ibegin; i<ipos; ++i ) {
8117 C(i,j) = A(i,i) * B(i,j) * scalar;
8118 }
8119 if( IsUpper_v<MT5> ) {
8120 for( size_t i=ipos; i<iend; ++i ) {
8121 reset( C(i,j) );
8122 }
8123 }
8124 }
8125 }
8126 }
8127 }
8128 //**********************************************************************************************
8129
8130 //**Default assignment to dense matrices (diagonal/diagonal)************************************
/*!\brief Default assignment \a C = scalar * A * B for two diagonal operands (any target
//        storage order).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The target is reset as a whole first; afterwards only the A.rows() diagonal elements are
// written, each as C(i,i) = A(i,i) * B(i,i) * scalar.
*/
8144 template< typename MT3 // Type of the left-hand side target matrix
8145 , typename MT4 // Type of the left-hand side matrix operand
8146 , typename MT5 // Type of the right-hand side matrix operand
8147 , typename ST2 > // Type of the scalar value
8148 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8149 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8150 {
8151 reset( C );
8152
8153 for( size_t i=0UL; i<A.rows(); ++i ) {
8154 C(i,i) = A(i,i) * B(i,i) * scalar;
8155 }
8156 }
8157 //**********************************************************************************************
8158
8159 //**Default assignment to dense matrices (small matrices)***************************************
/*!\brief Small-matrix assignment \a C = scalar * A * B for type combinations without a
//        vectorized kernel.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload participates only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is
// \a false and simply forwards to the matching selectDefaultAssignKernel() overload.
*/
8173 template< typename MT3 // Type of the left-hand side target matrix
8174 , typename MT4 // Type of the left-hand side matrix operand
8175 , typename MT5 // Type of the right-hand side matrix operand
8176 , typename ST2 > // Type of the scalar value
8177 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8178 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8179 {
8180 selectDefaultAssignKernel( C, A, B, scalar );
8181 }
8182 //**********************************************************************************************
8183
8184 //**Vectorized default assignment to row-major dense matrices (small matrices)******************
8199 template< typename MT3 // Type of the left-hand side target matrix
8200 , typename MT4 // Type of the left-hand side matrix operand
8201 , typename MT5 // Type of the right-hand side matrix operand
8202 , typename ST2 > // Type of the scalar value
8203 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8204 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8205 {
8206 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
8207
8208 const size_t M( A.rows() );
8209 const size_t N( B.columns() );
8210 const size_t K( A.columns() );
8211
8212 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8213
8214 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
8215 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
8216
8217 const SIMDType factor( set( scalar ) );
8218
8219 size_t j( 0UL );
8220
8221 if( IsIntegral_v<ElementType> )
8222 {
8223 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
8224 for( size_t i=0UL; i<M; ++i )
8225 {
8226 const size_t kbegin( ( IsUpper_v<MT4> )
8227 ?( ( IsLower_v<MT5> )
8228 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8229 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8230 :( IsLower_v<MT5> ? j : 0UL ) );
8231 const size_t kend( ( IsLower_v<MT4> )
8232 ?( ( IsUpper_v<MT5> )
8233 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
8234 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
8235 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
8236
8237 size_t k( kbegin );
8238
8239 if( k < kend )
8240 {
8241 SIMDType a1( set( A(i,k) ) );
8242 SIMDType xmm1( a1 * B.load(k,j ) );
8243 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
8244 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
8245 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
8246 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
8247 SIMDType xmm6( a1 * B.load(k,j+SIMDSIZE*5UL) );
8248 SIMDType xmm7( a1 * B.load(k,j+SIMDSIZE*6UL) );
8249 SIMDType xmm8( a1 * B.load(k,j+SIMDSIZE*7UL) );
8250
8251 for( ++k; k<kend; ++k ) {
8252 a1 = set( A(i,k) );
8253 xmm1 += a1 * B.load(k,j );
8254 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8255 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8256 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8257 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
8258 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
8259 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
8260 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
8261 }
8262
8263 C.store( i, j , xmm1 * factor );
8264 C.store( i, j+SIMDSIZE , xmm2 * factor );
8265 C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8266 C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8267 C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
8268 C.store( i, j+SIMDSIZE*5UL, xmm6 * factor );
8269 C.store( i, j+SIMDSIZE*6UL, xmm7 * factor );
8270 C.store( i, j+SIMDSIZE*7UL, xmm8 * factor );
8271 }
8272 else
8273 {
8274 const SIMDType zero;
8275 C.store( i, j , zero );
8276 C.store( i, j+SIMDSIZE , zero );
8277 C.store( i, j+SIMDSIZE*2UL, zero );
8278 C.store( i, j+SIMDSIZE*3UL, zero );
8279 C.store( i, j+SIMDSIZE*4UL, zero );
8280 C.store( i, j+SIMDSIZE*5UL, zero );
8281 C.store( i, j+SIMDSIZE*6UL, zero );
8282 C.store( i, j+SIMDSIZE*7UL, zero );
8283 }
8284 }
8285 }
8286 }
8287
8288 for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
8289 {
8290 size_t i( 0UL );
8291
8292 for( ; (i+2UL) <= M; i+=2UL )
8293 {
8294 const size_t kbegin( ( IsUpper_v<MT4> )
8295 ?( ( IsLower_v<MT5> )
8296 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8297 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8298 :( IsLower_v<MT5> ? j : 0UL ) );
8299 const size_t kend( ( IsLower_v<MT4> )
8300 ?( ( IsUpper_v<MT5> )
8301 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
8302 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8303 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
8304
8305 size_t k( kbegin );
8306
8307 if( k < kend )
8308 {
8309 SIMDType a1( set( A(i ,k) ) );
8310 SIMDType a2( set( A(i+1UL,k) ) );
8311 SIMDType b1( B.load(k,j ) );
8312 SIMDType b2( B.load(k,j+SIMDSIZE ) );
8313 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8314 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8315 SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
8316 SIMDType xmm1 ( a1 * b1 );
8317 SIMDType xmm2 ( a1 * b2 );
8318 SIMDType xmm3 ( a1 * b3 );
8319 SIMDType xmm4 ( a1 * b4 );
8320 SIMDType xmm5 ( a1 * b5 );
8321 SIMDType xmm6 ( a2 * b1 );
8322 SIMDType xmm7 ( a2 * b2 );
8323 SIMDType xmm8 ( a2 * b3 );
8324 SIMDType xmm9 ( a2 * b4 );
8325 SIMDType xmm10( a2 * b5 );
8326
8327 for( ++k; k<kend; ++k ) {
8328 a1 = set( A(i ,k) );
8329 a2 = set( A(i+1UL,k) );
8330 b1 = B.load(k,j );
8331 b2 = B.load(k,j+SIMDSIZE );
8332 b3 = B.load(k,j+SIMDSIZE*2UL);
8333 b4 = B.load(k,j+SIMDSIZE*3UL);
8334 b5 = B.load(k,j+SIMDSIZE*4UL);
8335 xmm1 += a1 * b1;
8336 xmm2 += a1 * b2;
8337 xmm3 += a1 * b3;
8338 xmm4 += a1 * b4;
8339 xmm5 += a1 * b5;
8340 xmm6 += a2 * b1;
8341 xmm7 += a2 * b2;
8342 xmm8 += a2 * b3;
8343 xmm9 += a2 * b4;
8344 xmm10 += a2 * b5;
8345 }
8346
8347 C.store( i , j , xmm1 * factor );
8348 C.store( i , j+SIMDSIZE , xmm2 * factor );
8349 C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8350 C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
8351 C.store( i , j+SIMDSIZE*4UL, xmm5 * factor );
8352 C.store( i+1UL, j , xmm6 * factor );
8353 C.store( i+1UL, j+SIMDSIZE , xmm7 * factor );
8354 C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
8355 C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
8356 C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
8357 }
8358 else
8359 {
8360 const SIMDType zero;
8361 C.store( i , j , zero );
8362 C.store( i , j+SIMDSIZE , zero );
8363 C.store( i , j+SIMDSIZE*2UL, zero );
8364 C.store( i , j+SIMDSIZE*3UL, zero );
8365 C.store( i , j+SIMDSIZE*4UL, zero );
8366 C.store( i+1UL, j , zero );
8367 C.store( i+1UL, j+SIMDSIZE , zero );
8368 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
8369 C.store( i+1UL, j+SIMDSIZE*3UL, zero );
8370 C.store( i+1UL, j+SIMDSIZE*4UL, zero );
8371 }
8372 }
8373
8374 if( i < M )
8375 {
8376 const size_t kbegin( ( IsUpper_v<MT4> )
8377 ?( ( IsLower_v<MT5> )
8378 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8379 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8380 :( IsLower_v<MT5> ? j : 0UL ) );
8381 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
8382
8383 size_t k( kbegin );
8384
8385 if( k < kend )
8386 {
8387 SIMDType a1( set( A(i,k) ) );
8388 SIMDType xmm1( a1 * B.load(k,j ) );
8389 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
8390 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
8391 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
8392 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
8393
8394 for( ++k; k<kend; ++k ) {
8395 a1 = set( A(i,k) );
8396 xmm1 += a1 * B.load(k,j );
8397 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8398 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8399 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8400 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
8401 }
8402
8403 C.store( i, j , xmm1 * factor );
8404 C.store( i, j+SIMDSIZE , xmm2 * factor );
8405 C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8406 C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8407 C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
8408 }
8409 else
8410 {
8411 const SIMDType zero;
8412 C.store( i, j , zero );
8413 C.store( i, j+SIMDSIZE , zero );
8414 C.store( i, j+SIMDSIZE*2UL, zero );
8415 C.store( i, j+SIMDSIZE*3UL, zero );
8416 C.store( i, j+SIMDSIZE*4UL, zero );
8417 }
8418 }
8419 }
8420
8421 for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
8422 {
8423 const size_t iend( UPP ? min(j+SIMDSIZE*4UL,M) : M );
8424 size_t i( 0UL );
8425
8426 if( SYM || HERM ) {
8427 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
8428 for( ; i<j; ++i ) {
8429 for( size_t jj=j; jj<jjend; ++jj ) {
8430 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
8431 }
8432 }
8433 }
8434 else if( LOW ) {
8435 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
8436 for( ; i<j; ++i ) {
8437 for( size_t jj=j; jj<jjend; ++jj ) {
8438 reset( C(i,jj) );
8439 }
8440 }
8441 }
8442
8443 for( ; (i+2UL) <= iend; i+=2UL )
8444 {
8445 const size_t kbegin( ( IsUpper_v<MT4> )
8446 ?( ( IsLower_v<MT5> )
8447 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8448 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8449 :( IsLower_v<MT5> ? j : 0UL ) );
8450 const size_t kend( ( IsLower_v<MT4> )
8451 ?( ( IsUpper_v<MT5> )
8452 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
8453 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8454 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
8455
8456 size_t k( kbegin );
8457
8458 if( k < kend )
8459 {
8460 SIMDType a1( set( A(i ,k) ) );
8461 SIMDType a2( set( A(i+1UL,k) ) );
8462 SIMDType b1( B.load(k,j ) );
8463 SIMDType b2( B.load(k,j+SIMDSIZE ) );
8464 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8465 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8466 SIMDType xmm1( a1 * b1 );
8467 SIMDType xmm2( a1 * b2 );
8468 SIMDType xmm3( a1 * b3 );
8469 SIMDType xmm4( a1 * b4 );
8470 SIMDType xmm5( a2 * b1 );
8471 SIMDType xmm6( a2 * b2 );
8472 SIMDType xmm7( a2 * b3 );
8473 SIMDType xmm8( a2 * b4 );
8474
8475 for( ++k; k<kend; ++k ) {
8476 a1 = set( A(i ,k) );
8477 a2 = set( A(i+1UL,k) );
8478 b1 = B.load(k,j );
8479 b2 = B.load(k,j+SIMDSIZE );
8480 b3 = B.load(k,j+SIMDSIZE*2UL);
8481 b4 = B.load(k,j+SIMDSIZE*3UL);
8482 xmm1 += a1 * b1;
8483 xmm2 += a1 * b2;
8484 xmm3 += a1 * b3;
8485 xmm4 += a1 * b4;
8486 xmm5 += a2 * b1;
8487 xmm6 += a2 * b2;
8488 xmm7 += a2 * b3;
8489 xmm8 += a2 * b4;
8490 }
8491
8492 C.store( i , j , xmm1 * factor );
8493 C.store( i , j+SIMDSIZE , xmm2 * factor );
8494 C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8495 C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
8496 C.store( i+1UL, j , xmm5 * factor );
8497 C.store( i+1UL, j+SIMDSIZE , xmm6 * factor );
8498 C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
8499 C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
8500 }
8501 else
8502 {
8503 const SIMDType zero;
8504 C.store( i , j , zero );
8505 C.store( i , j+SIMDSIZE , zero );
8506 C.store( i , j+SIMDSIZE*2UL, zero );
8507 C.store( i , j+SIMDSIZE*3UL, zero );
8508 C.store( i+1UL, j , zero );
8509 C.store( i+1UL, j+SIMDSIZE , zero );
8510 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
8511 C.store( i+1UL, j+SIMDSIZE*3UL, zero );
8512 }
8513 }
8514
8515 if( i < iend )
8516 {
8517 const size_t kbegin( ( IsUpper_v<MT4> )
8518 ?( ( IsLower_v<MT5> )
8519 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8520 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8521 :( IsLower_v<MT5> ? j : 0UL ) );
8522 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
8523
8524 size_t k( kbegin );
8525
8526 if( k < kend )
8527 {
8528 SIMDType a1( set( A(i,k) ) );
8529 SIMDType xmm1( a1 * B.load(k,j ) );
8530 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
8531 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
8532 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
8533
8534 for( ++k; k<kend; ++k ) {
8535 a1 = set( A(i,k) );
8536 xmm1 += a1 * B.load(k,j );
8537 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8538 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8539 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8540 }
8541
8542 C.store( i, j , xmm1 * factor );
8543 C.store( i, j+SIMDSIZE , xmm2 * factor );
8544 C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8545 C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8546 }
8547 else
8548 {
8549 const SIMDType zero;
8550 C.store( i, j , zero );
8551 C.store( i, j+SIMDSIZE , zero );
8552 C.store( i, j+SIMDSIZE*2UL, zero );
8553 C.store( i, j+SIMDSIZE*3UL, zero );
8554 }
8555
8556 if( UPP ) ++i;
8557 }
8558
8559 if( UPP ) {
8560 const size_t jjend( min(j+SIMDSIZE*4UL,N) );
8561 for( ; i<M; ++i ) {
8562 for( size_t jj=j; jj<jjend; ++jj ) {
8563 reset( C(i,jj) );
8564 }
8565 }
8566 }
8567 }
8568
8569 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8570 {
8571 const size_t iend( UPP ? min(j+SIMDSIZE*3UL,M) : M );
8572 size_t i( 0UL );
8573
8574 if( SYM || HERM ) {
8575 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
8576 for( ; i<j; ++i ) {
8577 for( size_t jj=j; jj<jjend; ++jj ) {
8578 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
8579 }
8580 }
8581 }
8582 else if( LOW ) {
8583 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
8584 for( ; i<j; ++i ) {
8585 for( size_t jj=j; jj<jjend; ++jj ) {
8586 reset( C(i,jj) );
8587 }
8588 }
8589 }
8590
8591 for( ; (i+2UL) <= iend; i+=2UL )
8592 {
8593 const size_t kbegin( ( IsUpper_v<MT4> )
8594 ?( ( IsLower_v<MT5> )
8595 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8596 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8597 :( IsLower_v<MT5> ? j : 0UL ) );
8598 const size_t kend( ( IsLower_v<MT4> )
8599 ?( ( IsUpper_v<MT5> )
8600 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
8601 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8602 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
8603
8604 size_t k( kbegin );
8605
8606 if( k < kend )
8607 {
8608 SIMDType a1( set( A(i ,k) ) );
8609 SIMDType a2( set( A(i+1UL,k) ) );
8610 SIMDType b1( B.load(k,j ) );
8611 SIMDType b2( B.load(k,j+SIMDSIZE ) );
8612 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8613 SIMDType xmm1( a1 * b1 );
8614 SIMDType xmm2( a1 * b2 );
8615 SIMDType xmm3( a1 * b3 );
8616 SIMDType xmm4( a2 * b1 );
8617 SIMDType xmm5( a2 * b2 );
8618 SIMDType xmm6( a2 * b3 );
8619
8620 for( ++k; k<kend; ++k ) {
8621 a1 = set( A(i ,k) );
8622 a2 = set( A(i+1UL,k) );
8623 b1 = B.load(k,j );
8624 b2 = B.load(k,j+SIMDSIZE );
8625 b3 = B.load(k,j+SIMDSIZE*2UL);
8626 xmm1 += a1 * b1;
8627 xmm2 += a1 * b2;
8628 xmm3 += a1 * b3;
8629 xmm4 += a2 * b1;
8630 xmm5 += a2 * b2;
8631 xmm6 += a2 * b3;
8632 }
8633
8634 C.store( i , j , xmm1 * factor );
8635 C.store( i , j+SIMDSIZE , xmm2 * factor );
8636 C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8637 C.store( i+1UL, j , xmm4 * factor );
8638 C.store( i+1UL, j+SIMDSIZE , xmm5 * factor );
8639 C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
8640 }
8641 else
8642 {
8643 const SIMDType zero;
8644 C.store( i , j , zero );
8645 C.store( i , j+SIMDSIZE , zero );
8646 C.store( i , j+SIMDSIZE*2UL, zero );
8647 C.store( i+1UL, j , zero );
8648 C.store( i+1UL, j+SIMDSIZE , zero );
8649 C.store( i+1UL, j+SIMDSIZE*2UL, zero );
8650 }
8651 }
8652
8653 if( i < iend )
8654 {
8655 const size_t kbegin( ( IsUpper_v<MT4> )
8656 ?( ( IsLower_v<MT5> )
8657 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8658 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8659 :( IsLower_v<MT5> ? j : 0UL ) );
8660 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
8661
8662 size_t k( kbegin );
8663
8664 if( k < kend )
8665 {
8666 SIMDType a1( set( A(i,k) ) );
8667 SIMDType xmm1( a1 * B.load(k,j ) );
8668 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
8669 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
8670
8671 for( ++k; k<kend; ++k ) {
8672 a1 = set( A(i,k) );
8673 xmm1 += a1 * B.load(k,j );
8674 xmm2 += a1 * B.load(k,j+SIMDSIZE );
8675 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8676 }
8677
8678 C.store( i, j , xmm1 * factor );
8679 C.store( i, j+SIMDSIZE , xmm2 * factor );
8680 C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8681 }
8682 else
8683 {
8684 const SIMDType zero;
8685 C.store( i, j , zero );
8686 C.store( i, j+SIMDSIZE , zero );
8687 C.store( i, j+SIMDSIZE*2UL, zero );
8688 }
8689
8690 if( UPP ) ++i;
8691 }
8692
8693 if( UPP ) {
8694 const size_t jjend( min(j+SIMDSIZE*3UL,N) );
8695 for( ; i<M; ++i ) {
8696 for( size_t jj=j; jj<jjend; ++jj ) {
8697 reset( C(i,jj) );
8698 }
8699 }
8700 }
8701 }
8702
8703 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8704 {
8705 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
8706 size_t i( 0UL );
8707
8708 if( SYM || HERM ) {
8709 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
8710 for( ; i<j; ++i ) {
8711 for( size_t jj=j; jj<jjend; ++jj ) {
8712 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
8713 }
8714 }
8715 }
8716 else if( LOW ) {
8717 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
8718 for( ; i<j; ++i ) {
8719 for( size_t jj=j; jj<jjend; ++jj ) {
8720 reset( C(i,jj) );
8721 }
8722 }
8723 }
8724
8725 for( ; (i+4UL) <= iend; i+=4UL )
8726 {
8727 const size_t kbegin( ( IsUpper_v<MT4> )
8728 ?( ( IsLower_v<MT5> )
8729 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8730 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8731 :( IsLower_v<MT5> ? j : 0UL ) );
8732 const size_t kend( ( IsLower_v<MT4> )
8733 ?( ( IsUpper_v<MT5> )
8734 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
8735 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8736 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8737
8738 size_t k( kbegin );
8739
8740 if( k < kend )
8741 {
8742 SIMDType a1( set( A(i ,k) ) );
8743 SIMDType a2( set( A(i+1UL,k) ) );
8744 SIMDType a3( set( A(i+2UL,k) ) );
8745 SIMDType a4( set( A(i+3UL,k) ) );
8746 SIMDType b1( B.load(k,j ) );
8747 SIMDType b2( B.load(k,j+SIMDSIZE) );
8748 SIMDType xmm1( a1 * b1 );
8749 SIMDType xmm2( a1 * b2 );
8750 SIMDType xmm3( a2 * b1 );
8751 SIMDType xmm4( a2 * b2 );
8752 SIMDType xmm5( a3 * b1 );
8753 SIMDType xmm6( a3 * b2 );
8754 SIMDType xmm7( a4 * b1 );
8755 SIMDType xmm8( a4 * b2 );
8756
8757 for( ++k; k<kend; ++k ) {
8758 a1 = set( A(i ,k) );
8759 a2 = set( A(i+1UL,k) );
8760 a3 = set( A(i+2UL,k) );
8761 a4 = set( A(i+3UL,k) );
8762 b1 = B.load(k,j );
8763 b2 = B.load(k,j+SIMDSIZE);
8764 xmm1 += a1 * b1;
8765 xmm2 += a1 * b2;
8766 xmm3 += a2 * b1;
8767 xmm4 += a2 * b2;
8768 xmm5 += a3 * b1;
8769 xmm6 += a3 * b2;
8770 xmm7 += a4 * b1;
8771 xmm8 += a4 * b2;
8772 }
8773
8774 C.store( i , j , xmm1 * factor );
8775 C.store( i , j+SIMDSIZE, xmm2 * factor );
8776 C.store( i+1UL, j , xmm3 * factor );
8777 C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8778 C.store( i+2UL, j , xmm5 * factor );
8779 C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8780 C.store( i+3UL, j , xmm7 * factor );
8781 C.store( i+3UL, j+SIMDSIZE, xmm8 * factor );
8782 }
8783 else
8784 {
8785 const SIMDType zero;
8786 C.store( i , j , zero );
8787 C.store( i , j+SIMDSIZE, zero );
8788 C.store( i+1UL, j , zero );
8789 C.store( i+1UL, j+SIMDSIZE, zero );
8790 C.store( i+2UL, j , zero );
8791 C.store( i+2UL, j+SIMDSIZE, zero );
8792 C.store( i+3UL, j , zero );
8793 C.store( i+3UL, j+SIMDSIZE, zero );
8794 }
8795 }
8796
8797 for( ; (i+3UL) <= iend; i+=3UL )
8798 {
8799 const size_t kbegin( ( IsUpper_v<MT4> )
8800 ?( ( IsLower_v<MT5> )
8801 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8802 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8803 :( IsLower_v<MT5> ? j : 0UL ) );
8804 const size_t kend( ( IsLower_v<MT4> )
8805 ?( ( IsUpper_v<MT5> )
8806 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
8807 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8808 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8809
8810 size_t k( kbegin );
8811
8812 if( k < kend )
8813 {
8814 SIMDType a1( set( A(i ,k) ) );
8815 SIMDType a2( set( A(i+1UL,k) ) );
8816 SIMDType a3( set( A(i+2UL,k) ) );
8817 SIMDType b1( B.load(k,j ) );
8818 SIMDType b2( B.load(k,j+SIMDSIZE) );
8819 SIMDType xmm1( a1 * b1 );
8820 SIMDType xmm2( a1 * b2 );
8821 SIMDType xmm3( a2 * b1 );
8822 SIMDType xmm4( a2 * b2 );
8823 SIMDType xmm5( a3 * b1 );
8824 SIMDType xmm6( a3 * b2 );
8825
8826 for( ++k; k<kend; ++k ) {
8827 a1 = set( A(i ,k) );
8828 a2 = set( A(i+1UL,k) );
8829 a3 = set( A(i+2UL,k) );
8830 b1 = B.load(k,j );
8831 b2 = B.load(k,j+SIMDSIZE);
8832 xmm1 += a1 * b1;
8833 xmm2 += a1 * b2;
8834 xmm3 += a2 * b1;
8835 xmm4 += a2 * b2;
8836 xmm5 += a3 * b1;
8837 xmm6 += a3 * b2;
8838 }
8839
8840 C.store( i , j , xmm1 * factor );
8841 C.store( i , j+SIMDSIZE, xmm2 * factor );
8842 C.store( i+1UL, j , xmm3 * factor );
8843 C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8844 C.store( i+2UL, j , xmm5 * factor );
8845 C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8846 }
8847 else
8848 {
8849 const SIMDType zero;
8850 C.store( i , j , zero );
8851 C.store( i , j+SIMDSIZE, zero );
8852 C.store( i+1UL, j , zero );
8853 C.store( i+1UL, j+SIMDSIZE, zero );
8854 C.store( i+2UL, j , zero );
8855 C.store( i+2UL, j+SIMDSIZE, zero );
8856 }
8857 }
8858
8859 for( ; (i+2UL) <= iend; i+=2UL )
8860 {
8861 const size_t kbegin( ( IsUpper_v<MT4> )
8862 ?( ( IsLower_v<MT5> )
8863 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8864 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8865 :( IsLower_v<MT5> ? j : 0UL ) );
8866 const size_t kend( ( IsLower_v<MT4> )
8867 ?( ( IsUpper_v<MT5> )
8868 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8869 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8870 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8871
8872 size_t k( kbegin );
8873
8874 if( k < kend )
8875 {
8876 SIMDType a1( set( A(i ,k) ) );
8877 SIMDType a2( set( A(i+1UL,k) ) );
8878 SIMDType b1( B.load(k,j ) );
8879 SIMDType b2( B.load(k,j+SIMDSIZE) );
8880 SIMDType xmm1( a1 * b1 );
8881 SIMDType xmm2( a1 * b2 );
8882 SIMDType xmm3( a2 * b1 );
8883 SIMDType xmm4( a2 * b2 );
8884
8885 for( ++k; k<kend; ++k ) {
8886 a1 = set( A(i ,k) );
8887 a2 = set( A(i+1UL,k) );
8888 b1 = B.load(k,j );
8889 b2 = B.load(k,j+SIMDSIZE);
8890 xmm1 += a1 * b1;
8891 xmm2 += a1 * b2;
8892 xmm3 += a2 * b1;
8893 xmm4 += a2 * b2;
8894 }
8895
8896 C.store( i , j , xmm1 * factor );
8897 C.store( i , j+SIMDSIZE, xmm2 * factor );
8898 C.store( i+1UL, j , xmm3 * factor );
8899 C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8900 }
8901 else
8902 {
8903 const SIMDType zero;
8904 C.store( i , j , zero );
8905 C.store( i , j+SIMDSIZE, zero );
8906 C.store( i+1UL, j , zero );
8907 C.store( i+1UL, j+SIMDSIZE, zero );
8908 }
8909 }
8910
8911 if( i < iend )
8912 {
8913 const size_t kbegin( ( IsUpper_v<MT4> )
8914 ?( ( IsLower_v<MT5> )
8915 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8916 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8917 :( IsLower_v<MT5> ? j : 0UL ) );
8918 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8919
8920 size_t k( kbegin );
8921
8922 if( k < kend )
8923 {
8924 SIMDType a1( set( A(i,k) ) );
8925 SIMDType xmm1( a1 * B.load(k,j ) );
8926 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE) );
8927
8928 for( ++k; k<kend; ++k ) {
8929 a1 = set( A(i,k) );
8930 xmm1 += a1 * B.load(k,j );
8931 xmm2 += a1 * B.load(k,j+SIMDSIZE);
8932 }
8933
8934 C.store( i, j , xmm1 * factor );
8935 C.store( i, j+SIMDSIZE, xmm2 * factor );
8936 }
8937 else
8938 {
8939 const SIMDType zero;
8940 C.store( i, j , zero );
8941 C.store( i, j+SIMDSIZE, zero );
8942 }
8943
8944 if( UPP ) ++i;
8945 }
8946
8947 if( UPP ) {
8948 const size_t jjend( min(j+SIMDSIZE*2UL,N) );
8949 for( ; i<M; ++i ) {
8950 for( size_t jj=j; jj<jjend; ++jj ) {
8951 reset( C(i,jj) );
8952 }
8953 }
8954 }
8955 }
8956
8957 for( ; j<jpos; j+=SIMDSIZE )
8958 {
8959 const size_t iend( UPP ? min(j+SIMDSIZE,M) : M );
8960 size_t i( 0UL );
8961
8962 if( SYM || HERM ) {
8963 const size_t jjend( min(j+SIMDSIZE,N) );
8964 for( ; i<j; ++i ) {
8965 for( size_t jj=j; jj<jjend; ++jj ) {
8966 C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
8967 }
8968 }
8969 }
8970 else if( LOW ) {
8971 const size_t jjend( min(j+SIMDSIZE,N) );
8972 for( ; i<j; ++i ) {
8973 for( size_t jj=j; jj<jjend; ++jj ) {
8974 reset( C(i,jj) );
8975 }
8976 }
8977 }
8978
8979 for( ; (i+4UL) <= iend; i+=4UL )
8980 {
8981 const size_t kbegin( ( IsUpper_v<MT4> )
8982 ?( ( IsLower_v<MT5> )
8983 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8984 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8985 :( IsLower_v<MT5> ? j : 0UL ) );
8986 const size_t kend( ( IsLower_v<MT4> )
8987 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8988 :( K ) );
8989
8990 size_t k( kbegin );
8991
8992 if( k < kend )
8993 {
8994 SIMDType b1( B.load(k,j) );
8995 SIMDType xmm1( set( A(i ,k) ) * b1 );
8996 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
8997 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
8998 SIMDType xmm4( set( A(i+3UL,k) ) * b1 );
8999
9000 for( ++k; k<kend; ++k ) {
9001 b1 = B.load(k,j);
9002 xmm1 += set( A(i ,k) ) * b1;
9003 xmm2 += set( A(i+1UL,k) ) * b1;
9004 xmm3 += set( A(i+2UL,k) ) * b1;
9005 xmm4 += set( A(i+3UL,k) ) * b1;
9006 }
9007
9008 C.store( i , j, xmm1 * factor );
9009 C.store( i+1UL, j, xmm2 * factor );
9010 C.store( i+2UL, j, xmm3 * factor );
9011 C.store( i+3UL, j, xmm4 * factor );
9012 }
9013 else
9014 {
9015 const SIMDType zero;
9016 C.store( i , j, zero );
9017 C.store( i+1UL, j, zero );
9018 C.store( i+2UL, j, zero );
9019 C.store( i+3UL, j, zero );
9020 }
9021 }
9022
9023 for( ; (i+3UL) <= iend; i+=3UL )
9024 {
9025 const size_t kbegin( ( IsUpper_v<MT4> )
9026 ?( ( IsLower_v<MT5> )
9027 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9028 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9029 :( IsLower_v<MT5> ? j : 0UL ) );
9030 const size_t kend( ( IsLower_v<MT4> )
9031 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
9032 :( K ) );
9033
9034 size_t k( kbegin );
9035
9036 if( k < kend )
9037 {
9038 SIMDType b1( B.load(k,j) );
9039 SIMDType xmm1( set( A(i ,k) ) * b1 );
9040 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
9041 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
9042
9043 for( ++k; k<kend; ++k ) {
9044 b1 = B.load(k,j);
9045 xmm1 += set( A(i ,k) ) * b1;
9046 xmm2 += set( A(i+1UL,k) ) * b1;
9047 xmm3 += set( A(i+2UL,k) ) * b1;
9048 }
9049
9050 C.store( i , j, xmm1 * factor );
9051 C.store( i+1UL, j, xmm2 * factor );
9052 C.store( i+2UL, j, xmm3 * factor );
9053 }
9054 else
9055 {
9056 const SIMDType zero;
9057 C.store( i , j, zero );
9058 C.store( i+1UL, j, zero );
9059 C.store( i+2UL, j, zero );
9060 }
9061 }
9062
9063 for( ; (i+2UL) <= iend; i+=2UL )
9064 {
9065 const size_t kbegin( ( IsUpper_v<MT4> )
9066 ?( ( IsLower_v<MT5> )
9067 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9068 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9069 :( IsLower_v<MT5> ? j : 0UL ) );
9070 const size_t kend( ( IsLower_v<MT4> )
9071 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
9072 :( K ) );
9073
9074 size_t k( kbegin );
9075
9076 if( k < kend )
9077 {
9078 SIMDType b1( B.load(k,j) );
9079 SIMDType xmm1( set( A(i ,k) ) * b1 );
9080 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
9081
9082 for( ++k; k<kend; ++k ) {
9083 b1 = B.load(k,j);
9084 xmm1 += set( A(i ,k) ) * b1;
9085 xmm2 += set( A(i+1UL,k) ) * b1;
9086 }
9087
9088 C.store( i , j, xmm1 * factor );
9089 C.store( i+1UL, j, xmm2 * factor );
9090 }
9091 else
9092 {
9093 const SIMDType zero;
9094 C.store( i , j, zero );
9095 C.store( i+1UL, j, zero );
9096 }
9097 }
9098
9099 if( i < iend )
9100 {
9101 const size_t kbegin( ( IsUpper_v<MT4> )
9102 ?( ( IsLower_v<MT5> )
9103 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9104 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9105 :( IsLower_v<MT5> ? j : 0UL ) );
9106
9107 size_t k( kbegin );
9108
9109 if( k < K )
9110 {
9111 SIMDType xmm1( set( A(i,k) ) * B.load(k,j) );
9112
9113 for( ++k; k<K; ++k ) {
9114 xmm1 += set( A(i,k) ) * B.load(k,j);
9115 }
9116
9117 C.store( i, j, xmm1 * factor );
9118 }
9119 else
9120 {
9121 const SIMDType zero;
9122 C.store( i, j, zero );
9123 }
9124
9125 if( UPP ) ++i;
9126 }
9127
9128 if( UPP ) {
9129 const size_t jjend( min(j+SIMDSIZE,N) );
9130 for( ; i<M; ++i ) {
9131 for( size_t jj=j; jj<jjend; ++jj ) {
9132 reset( C(i,jj) );
9133 }
9134 }
9135 }
9136 }
9137
9138 for( ; remainder && j<N; ++j )
9139 {
9140 size_t i( 0UL );
9141
9142 if( SYM || HERM ) {
9143 for( ; i<j; ++i ) {
9144 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
9145 }
9146 }
9147 else if( LOW ) {
9148 for( ; i<j; ++i ) {
9149 reset( C(i,j) );
9150 }
9151 }
9152
9153 for( ; (i+2UL) <= M; i+=2UL )
9154 {
9155 const size_t kbegin( ( IsUpper_v<MT4> )
9156 ?( ( IsLower_v<MT5> )
9157 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9158 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9159 :( IsLower_v<MT5> ? j : 0UL ) );
9160 const size_t kend( ( IsLower_v<MT4> )
9161 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
9162 :( K ) );
9163
9164 size_t k( kbegin );
9165
9166 if( k < kend )
9167 {
9168 ElementType value1( A(i ,k) * B(k,j) );
9169 ElementType value2( A(i+1UL,k) * B(k,j) );
9170
9171 for( ++k; k<kend; ++k ) {
9172 value1 += A(i ,k) * B(k,j);
9173 value2 += A(i+1UL,k) * B(k,j);
9174 }
9175
9176 C(i ,j) = value1 * scalar;
9177 C(i+1UL,j) = value2 * scalar;
9178 }
9179 else
9180 {
9181 reset( C(i ,j) );
9182 reset( C(i+1UL,j) );
9183 }
9184 }
9185
9186 if( i < M )
9187 {
9188 const size_t kbegin( ( IsUpper_v<MT4> )
9189 ?( ( IsLower_v<MT5> )
9190 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9191 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9192 :( IsLower_v<MT5> ? j : 0UL ) );
9193
9194 size_t k( kbegin );
9195
9196 if( k < K )
9197 {
9198 ElementType value( A(i,k) * B(k,j) );
9199
9200 for( ++k; k<K; ++k ) {
9201 value += A(i,k) * B(k,j);
9202 }
9203
9204 C(i,j) = value * scalar;
9205 }
9206 else
9207 {
9208 reset( C(i,j) );
9209 }
9210 }
9211 }
9212 }
9213 //**********************************************************************************************
9214
9215 //**Vectorized default assignment to column-major dense matrices (small matrices)***************
9230 template< typename MT3 // Type of the left-hand side target matrix
9231 , typename MT4 // Type of the left-hand side matrix operand
9232 , typename MT5 // Type of the right-hand side matrix operand
9233 , typename ST2 > // Type of the scalar value
9234 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9235 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9236 {
9237 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
9238
9239 const size_t M( A.rows() );
9240 const size_t N( B.columns() );
9241 const size_t K( A.columns() );
9242
9243 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
9244
9245 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
9246 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
9247
9248 const SIMDType factor( set( scalar ) );
9249
9250 size_t i( 0UL );
9251
9252 if( IsIntegral_v<ElementType> )
9253 {
9254 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
9255 for( size_t j=0UL; j<N; ++j )
9256 {
9257 const size_t kbegin( ( IsLower_v<MT5> )
9258 ?( ( IsUpper_v<MT4> )
9259 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9260 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9261 :( IsUpper_v<MT4> ? i : 0UL ) );
9262 const size_t kend( ( IsUpper_v<MT5> )
9263 ?( ( IsLower_v<MT4> )
9264 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
9265 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
9266 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
9267
9268 size_t k( kbegin );
9269
9270 if( k < kend )
9271 {
9272 SIMDType b1( set( B(k,j) ) );
9273 SIMDType xmm1( A.load(i ,k) * b1 );
9274 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
9275 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
9276 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
9277 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
9278 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,k) * b1 );
9279 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,k) * b1 );
9280 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,k) * b1 );
9281
9282 for( ++k; k<kend; ++k ) {
9283 b1 = set( B(k,j) );
9284 xmm1 += A.load(i ,k) * b1;
9285 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9286 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9287 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9288 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
9289 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
9290 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
9291 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
9292 }
9293
9294 C.store( i , j, xmm1 * factor );
9295 C.store( i+SIMDSIZE , j, xmm2 * factor );
9296 C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
9297 C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
9298 C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
9299 C.store( i+SIMDSIZE*5UL, j, xmm6 * factor );
9300 C.store( i+SIMDSIZE*6UL, j, xmm7 * factor );
9301 C.store( i+SIMDSIZE*7UL, j, xmm8 * factor );
9302 }
9303 else
9304 {
9305 const SIMDType zero;
9306 C.store( i , j, zero );
9307 C.store( i+SIMDSIZE , j, zero );
9308 C.store( i+SIMDSIZE*2UL, j, zero );
9309 C.store( i+SIMDSIZE*3UL, j, zero );
9310 C.store( i+SIMDSIZE*4UL, j, zero );
9311 C.store( i+SIMDSIZE*5UL, j, zero );
9312 C.store( i+SIMDSIZE*6UL, j, zero );
9313 C.store( i+SIMDSIZE*7UL, j, zero );
9314 }
9315 }
9316 }
9317 }
9318
9319 for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
9320 {
9321 size_t j( 0UL );
9322
9323 for( ; (j+2UL) <= N; j+=2UL )
9324 {
9325 const size_t kbegin( ( IsLower_v<MT5> )
9326 ?( ( IsUpper_v<MT4> )
9327 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9328 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9329 :( IsUpper_v<MT4> ? i : 0UL ) );
9330 const size_t kend( ( IsUpper_v<MT5> )
9331 ?( ( IsLower_v<MT4> )
9332 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9333 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9334 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
9335
9336 size_t k( kbegin );
9337
9338 if( k < kend )
9339 {
9340 SIMDType a1( A.load(i ,k) );
9341 SIMDType a2( A.load(i+SIMDSIZE ,k) );
9342 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9343 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
9344 SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
9345 SIMDType b1( set( B(k,j ) ) );
9346 SIMDType b2( set( B(k,j+1UL) ) );
9347 SIMDType xmm1 ( a1 * b1 );
9348 SIMDType xmm2 ( a2 * b1 );
9349 SIMDType xmm3 ( a3 * b1 );
9350 SIMDType xmm4 ( a4 * b1 );
9351 SIMDType xmm5 ( a5 * b1 );
9352 SIMDType xmm6 ( a1 * b2 );
9353 SIMDType xmm7 ( a2 * b2 );
9354 SIMDType xmm8 ( a3 * b2 );
9355 SIMDType xmm9 ( a4 * b2 );
9356 SIMDType xmm10( a5 * b2 );
9357
9358 for( ++k; k<kend; ++k ) {
9359 a1 = A.load(i ,k);
9360 a2 = A.load(i+SIMDSIZE ,k);
9361 a3 = A.load(i+SIMDSIZE*2UL,k);
9362 a4 = A.load(i+SIMDSIZE*3UL,k);
9363 a5 = A.load(i+SIMDSIZE*4UL,k);
9364 b1 = set( B(k,j ) );
9365 b2 = set( B(k,j+1UL) );
9366 xmm1 += a1 * b1;
9367 xmm2 += a2 * b1;
9368 xmm3 += a3 * b1;
9369 xmm4 += a4 * b1;
9370 xmm5 += a5 * b1;
9371 xmm6 += a1 * b2;
9372 xmm7 += a2 * b2;
9373 xmm8 += a3 * b2;
9374 xmm9 += a4 * b2;
9375 xmm10 += a5 * b2;
9376 }
9377
9378 C.store( i , j , xmm1 * factor );
9379 C.store( i+SIMDSIZE , j , xmm2 * factor );
9380 C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
9381 C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
9382 C.store( i+SIMDSIZE*4UL, j , xmm5 * factor );
9383 C.store( i , j+1UL, xmm6 * factor );
9384 C.store( i+SIMDSIZE , j+1UL, xmm7 * factor );
9385 C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
9386 C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
9387 C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
9388 }
9389 else
9390 {
9391 const SIMDType zero;
9392 C.store( i , j , zero );
9393 C.store( i+SIMDSIZE , j , zero );
9394 C.store( i+SIMDSIZE*2UL, j , zero );
9395 C.store( i+SIMDSIZE*3UL, j , zero );
9396 C.store( i+SIMDSIZE*4UL, j , zero );
9397 C.store( i , j+1UL, zero );
9398 C.store( i+SIMDSIZE , j+1UL, zero );
9399 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
9400 C.store( i+SIMDSIZE*3UL, j+1UL, zero );
9401 C.store( i+SIMDSIZE*4UL, j+1UL, zero );
9402 }
9403 }
9404
9405 if( j < N )
9406 {
9407 const size_t kbegin( ( IsLower_v<MT5> )
9408 ?( ( IsUpper_v<MT4> )
9409 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9410 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9411 :( IsUpper_v<MT4> ? i : 0UL ) );
9412 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
9413
9414 size_t k( kbegin );
9415
9416 if( k < kend )
9417 {
9418 SIMDType b1( set( B(k,j) ) );
9419 SIMDType xmm1( A.load(i ,k) * b1 );
9420 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
9421 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
9422 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
9423 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
9424
9425 for( ++k; k<kend; ++k ) {
9426 b1 = set( B(k,j) );
9427 xmm1 += A.load(i ,k) * b1;
9428 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9429 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9430 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9431 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
9432 }
9433
9434 C.store( i , j, xmm1 * factor );
9435 C.store( i+SIMDSIZE , j, xmm2 * factor );
9436 C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
9437 C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
9438 C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
9439 }
9440 else
9441 {
9442 const SIMDType zero;
9443 C.store( i , j, zero );
9444 C.store( i+SIMDSIZE , j, zero );
9445 C.store( i+SIMDSIZE*2UL, j, zero );
9446 C.store( i+SIMDSIZE*3UL, j, zero );
9447 C.store( i+SIMDSIZE*4UL, j, zero );
9448 }
9449 }
9450 }
9451
9452 for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
9453 {
9454 const size_t jend( LOW ? min(i+SIMDSIZE*4UL,N) : N );
9455 size_t j( 0UL );
9456
9457 if( SYM || HERM ) {
9458 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
9459 for( ; j<i; ++j ) {
9460 for( size_t ii=i; ii<iiend; ++ii ) {
9461 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
9462 }
9463 }
9464 }
9465 else if( UPP ) {
9466 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
9467 for( ; j<i; ++j ) {
9468 for( size_t ii=i; ii<iiend; ++ii ) {
9469 reset( C(ii,j) );
9470 }
9471 }
9472 }
9473
9474 for( ; (j+2UL) <= jend; j+=2UL )
9475 {
9476 const size_t kbegin( ( IsLower_v<MT5> )
9477 ?( ( IsUpper_v<MT4> )
9478 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9479 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9480 :( IsUpper_v<MT4> ? i : 0UL ) );
9481 const size_t kend( ( IsUpper_v<MT5> )
9482 ?( ( IsLower_v<MT4> )
9483 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9484 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9485 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
9486
9487 size_t k( kbegin );
9488
9489 if( k < kend )
9490 {
9491 SIMDType a1( A.load(i ,k) );
9492 SIMDType a2( A.load(i+SIMDSIZE ,k) );
9493 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9494 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
9495 SIMDType b1( set( B(k,j ) ) );
9496 SIMDType b2( set( B(k,j+1UL) ) );
9497 SIMDType xmm1( a1 * b1 );
9498 SIMDType xmm2( a2 * b1 );
9499 SIMDType xmm3( a3 * b1 );
9500 SIMDType xmm4( a4 * b1 );
9501 SIMDType xmm5( a1 * b2 );
9502 SIMDType xmm6( a2 * b2 );
9503 SIMDType xmm7( a3 * b2 );
9504 SIMDType xmm8( a4 * b2 );
9505
9506 for( ++k; k<kend; ++k ) {
9507 a1 = A.load(i ,k);
9508 a2 = A.load(i+SIMDSIZE ,k);
9509 a3 = A.load(i+SIMDSIZE*2UL,k);
9510 a4 = A.load(i+SIMDSIZE*3UL,k);
9511 b1 = set( B(k,j ) );
9512 b2 = set( B(k,j+1UL) );
9513 xmm1 += a1 * b1;
9514 xmm2 += a2 * b1;
9515 xmm3 += a3 * b1;
9516 xmm4 += a4 * b1;
9517 xmm5 += a1 * b2;
9518 xmm6 += a2 * b2;
9519 xmm7 += a3 * b2;
9520 xmm8 += a4 * b2;
9521 }
9522
9523 C.store( i , j , xmm1 * factor );
9524 C.store( i+SIMDSIZE , j , xmm2 * factor );
9525 C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
9526 C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
9527 C.store( i , j+1UL, xmm5 * factor );
9528 C.store( i+SIMDSIZE , j+1UL, xmm6 * factor );
9529 C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
9530 C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
9531 }
9532 else
9533 {
9534 const SIMDType zero;
9535 C.store( i , j , zero );
9536 C.store( i+SIMDSIZE , j , zero );
9537 C.store( i+SIMDSIZE*2UL, j , zero );
9538 C.store( i+SIMDSIZE*3UL, j , zero );
9539 C.store( i , j+1UL, zero );
9540 C.store( i+SIMDSIZE , j+1UL, zero );
9541 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
9542 C.store( i+SIMDSIZE*3UL, j+1UL, zero );
9543 }
9544 }
9545
9546 if( j < jend )
9547 {
9548 const size_t kbegin( ( IsLower_v<MT5> )
9549 ?( ( IsUpper_v<MT4> )
9550 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9551 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9552 :( IsUpper_v<MT4> ? i : 0UL ) );
9553 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
9554
9555 size_t k( kbegin );
9556
9557 if( k < kend )
9558 {
9559 SIMDType b1( set( B(k,j) ) );
9560 SIMDType xmm1( A.load(i ,k) * b1 );
9561 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
9562 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
9563 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
9564
9565 for( ++k; k<kend; ++k ) {
9566 b1 = set( B(k,j) );
9567 xmm1 += A.load(i ,k) * b1;
9568 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9569 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9570 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9571 }
9572
9573 C.store( i , j, xmm1 * factor );
9574 C.store( i+SIMDSIZE , j, xmm2 * factor );
9575 C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
9576 C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
9577 }
9578 else
9579 {
9580 const SIMDType zero;
9581 C.store( i , j, zero );
9582 C.store( i+SIMDSIZE , j, zero );
9583 C.store( i+SIMDSIZE*2UL, j, zero );
9584 C.store( i+SIMDSIZE*3UL, j, zero );
9585 }
9586
9587 if( LOW ) ++j;
9588 }
9589
9590 if( LOW ) {
9591 const size_t iiend( min(i+SIMDSIZE*4UL,M) );
9592 for( ; j<N; ++j ) {
9593 for( size_t ii=i; ii<iiend; ++ii ) {
9594 reset( C(ii,j) );
9595 }
9596 }
9597 }
9598 }
9599
9600 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
9601 {
9602 const size_t jend( LOW ? min(i+SIMDSIZE*3UL,N) : N );
9603 size_t j( 0UL );
9604
9605 if( SYM || HERM ) {
9606 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
9607 for( ; j<i; ++j ) {
9608 for( size_t ii=i; ii<iiend; ++ii ) {
9609 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
9610 }
9611 }
9612 }
9613 else if( UPP ) {
9614 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
9615 for( ; j<i; ++j ) {
9616 for( size_t ii=i; ii<iiend; ++ii ) {
9617 reset( C(ii,j) );
9618 }
9619 }
9620 }
9621
9622 for( ; (j+2UL) <= jend; j+=2UL )
9623 {
9624 const size_t kbegin( ( IsLower_v<MT5> )
9625 ?( ( IsUpper_v<MT4> )
9626 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9627 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9628 :( IsUpper_v<MT4> ? i : 0UL ) );
9629 const size_t kend( ( IsUpper_v<MT5> )
9630 ?( ( IsLower_v<MT4> )
9631 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9632 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9633 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
9634
9635 size_t k( kbegin );
9636
9637 if( k < kend )
9638 {
9639 SIMDType a1( A.load(i ,k) );
9640 SIMDType a2( A.load(i+SIMDSIZE ,k) );
9641 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9642 SIMDType b1( set( B(k,j ) ) );
9643 SIMDType b2( set( B(k,j+1UL) ) );
9644 SIMDType xmm1( a1 * b1 );
9645 SIMDType xmm2( a2 * b1 );
9646 SIMDType xmm3( a3 * b1 );
9647 SIMDType xmm4( a1 * b2 );
9648 SIMDType xmm5( a2 * b2 );
9649 SIMDType xmm6( a3 * b2 );
9650
9651 for( ++k; k<kend; ++k ) {
9652 a1 = A.load(i ,k);
9653 a2 = A.load(i+SIMDSIZE ,k);
9654 a3 = A.load(i+SIMDSIZE*2UL,k);
9655 b1 = set( B(k,j ) );
9656 b2 = set( B(k,j+1UL) );
9657 xmm1 += a1 * b1;
9658 xmm2 += a2 * b1;
9659 xmm3 += a3 * b1;
9660 xmm4 += a1 * b2;
9661 xmm5 += a2 * b2;
9662 xmm6 += a3 * b2;
9663 }
9664
9665 C.store( i , j , xmm1 * factor );
9666 C.store( i+SIMDSIZE , j , xmm2 * factor );
9667 C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
9668 C.store( i , j+1UL, xmm4 * factor );
9669 C.store( i+SIMDSIZE , j+1UL, xmm5 * factor );
9670 C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
9671 }
9672 else
9673 {
9674 const SIMDType zero;
9675 C.store( i , j , zero );
9676 C.store( i+SIMDSIZE , j , zero );
9677 C.store( i+SIMDSIZE*2UL, j , zero );
9678 C.store( i , j+1UL, zero );
9679 C.store( i+SIMDSIZE , j+1UL, zero );
9680 C.store( i+SIMDSIZE*2UL, j+1UL, zero );
9681 }
9682 }
9683
9684 if( j < jend )
9685 {
9686 const size_t kbegin( ( IsLower_v<MT5> )
9687 ?( ( IsUpper_v<MT4> )
9688 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9689 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9690 :( IsUpper_v<MT4> ? i : 0UL ) );
9691 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
9692
9693 size_t k( kbegin );
9694
9695 if( k < kend )
9696 {
9697 SIMDType b1( set( B(k,j) ) );
9698 SIMDType xmm1( A.load(i ,k) * b1 );
9699 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
9700 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
9701
9702 for( ++k; k<kend; ++k ) {
9703 b1 = set( B(k,j) );
9704 xmm1 += A.load(i ,k) * b1;
9705 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9706 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9707 }
9708
9709 C.store( i , j, xmm1 * factor );
9710 C.store( i+SIMDSIZE , j, xmm2 * factor );
9711 C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
9712 }
9713 else
9714 {
9715 const SIMDType zero;
9716 C.store( i , j, zero );
9717 C.store( i+SIMDSIZE , j, zero );
9718 C.store( i+SIMDSIZE*2UL, j, zero );
9719 }
9720
9721 if( LOW ) ++j;
9722 }
9723
9724 if( LOW ) {
9725 const size_t iiend( min(i+SIMDSIZE*3UL,M) );
9726 for( ; j<N; ++j ) {
9727 for( size_t ii=i; ii<iiend; ++ii ) {
9728 reset( C(ii,j) );
9729 }
9730 }
9731 }
9732 }
9733
9734 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
9735 {
9736 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
9737 size_t j( 0UL );
9738
9739 if( SYM || HERM ) {
9740 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
9741 for( ; j<i; ++j ) {
9742 for( size_t ii=i; ii<iiend; ++ii ) {
9743 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
9744 }
9745 }
9746 }
9747 else if( UPP ) {
9748 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
9749 for( ; j<i; ++j ) {
9750 for( size_t ii=i; ii<iiend; ++ii ) {
9751 reset( C(ii,j) );
9752 }
9753 }
9754 }
9755
9756 for( ; (j+4UL) <= jend; j+=4UL )
9757 {
9758 const size_t kbegin( ( IsLower_v<MT5> )
9759 ?( ( IsUpper_v<MT4> )
9760 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9761 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9762 :( IsUpper_v<MT4> ? i : 0UL ) );
9763 const size_t kend( ( IsUpper_v<MT5> )
9764 ?( ( IsLower_v<MT4> )
9765 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
9766 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
9767 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
9768
9769 size_t k( kbegin );
9770
9771 if( k < kend )
9772 {
9773 SIMDType a1( A.load(i ,k) );
9774 SIMDType a2( A.load(i+SIMDSIZE,k) );
9775 SIMDType b1( set( B(k,j ) ) );
9776 SIMDType b2( set( B(k,j+1UL) ) );
9777 SIMDType b3( set( B(k,j+2UL) ) );
9778 SIMDType b4( set( B(k,j+3UL) ) );
9779 SIMDType xmm1( a1 * b1 );
9780 SIMDType xmm2( a2 * b1 );
9781 SIMDType xmm3( a1 * b2 );
9782 SIMDType xmm4( a2 * b2 );
9783 SIMDType xmm5( a1 * b3 );
9784 SIMDType xmm6( a2 * b3 );
9785 SIMDType xmm7( a1 * b4 );
9786 SIMDType xmm8( a2 * b4 );
9787
9788 for( ++k; k<kend; ++k ) {
9789 a1 = A.load(i ,k);
9790 a2 = A.load(i+SIMDSIZE,k);
9791 b1 = set( B(k,j ) );
9792 b2 = set( B(k,j+1UL) );
9793 b3 = set( B(k,j+2UL) );
9794 b4 = set( B(k,j+3UL) );
9795 xmm1 += a1 * b1;
9796 xmm2 += a2 * b1;
9797 xmm3 += a1 * b2;
9798 xmm4 += a2 * b2;
9799 xmm5 += a1 * b3;
9800 xmm6 += a2 * b3;
9801 xmm7 += a1 * b4;
9802 xmm8 += a2 * b4;
9803 }
9804
9805 C.store( i , j , xmm1 * factor );
9806 C.store( i+SIMDSIZE, j , xmm2 * factor );
9807 C.store( i , j+1UL, xmm3 * factor );
9808 C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
9809 C.store( i , j+2UL, xmm5 * factor );
9810 C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
9811 C.store( i , j+3UL, xmm7 * factor );
9812 C.store( i+SIMDSIZE, j+3UL, xmm8 * factor );
9813 }
9814 else
9815 {
9816 const SIMDType zero;
9817 C.store( i , j , zero );
9818 C.store( i+SIMDSIZE, j , zero );
9819 C.store( i , j+1UL, zero );
9820 C.store( i+SIMDSIZE, j+1UL, zero );
9821 C.store( i , j+2UL, zero );
9822 C.store( i+SIMDSIZE, j+2UL, zero );
9823 C.store( i , j+3UL, zero );
9824 C.store( i+SIMDSIZE, j+3UL, zero );
9825 }
9826 }
9827
9828 for( ; (j+3UL) <= jend; j+=3UL )
9829 {
9830 const size_t kbegin( ( IsLower_v<MT5> )
9831 ?( ( IsUpper_v<MT4> )
9832 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9833 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9834 :( IsUpper_v<MT4> ? i : 0UL ) );
9835 const size_t kend( ( IsUpper_v<MT5> )
9836 ?( ( IsLower_v<MT4> )
9837 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
9838 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
9839 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
9840
9841 size_t k( kbegin );
9842
9843 if( k < kend )
9844 {
9845 SIMDType a1( A.load(i ,k) );
9846 SIMDType a2( A.load(i+SIMDSIZE,k) );
9847 SIMDType b1( set( B(k,j ) ) );
9848 SIMDType b2( set( B(k,j+1UL) ) );
9849 SIMDType b3( set( B(k,j+2UL) ) );
9850 SIMDType xmm1( a1 * b1 );
9851 SIMDType xmm2( a2 * b1 );
9852 SIMDType xmm3( a1 * b2 );
9853 SIMDType xmm4( a2 * b2 );
9854 SIMDType xmm5( a1 * b3 );
9855 SIMDType xmm6( a2 * b3 );
9856
9857 for( ++k; k<kend; ++k ) {
9858 a1 = A.load(i ,k);
9859 a2 = A.load(i+SIMDSIZE,k);
9860 b1 = set( B(k,j ) );
9861 b2 = set( B(k,j+1UL) );
9862 b3 = set( B(k,j+2UL) );
9863 xmm1 += a1 * b1;
9864 xmm2 += a2 * b1;
9865 xmm3 += a1 * b2;
9866 xmm4 += a2 * b2;
9867 xmm5 += a1 * b3;
9868 xmm6 += a2 * b3;
9869 }
9870
9871 C.store( i , j , xmm1 * factor );
9872 C.store( i+SIMDSIZE, j , xmm2 * factor );
9873 C.store( i , j+1UL, xmm3 * factor );
9874 C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
9875 C.store( i , j+2UL, xmm5 * factor );
9876 C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
9877 }
9878 else
9879 {
9880 const SIMDType zero;
9881 C.store( i , j , zero );
9882 C.store( i+SIMDSIZE, j , zero );
9883 C.store( i , j+1UL, zero );
9884 C.store( i+SIMDSIZE, j+1UL, zero );
9885 C.store( i , j+2UL, zero );
9886 C.store( i+SIMDSIZE, j+2UL, zero );
9887 }
9888 }
9889
9890 for( ; (j+2UL) <= jend; j+=2UL )
9891 {
9892 const size_t kbegin( ( IsLower_v<MT5> )
9893 ?( ( IsUpper_v<MT4> )
9894 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9895 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9896 :( IsUpper_v<MT4> ? i : 0UL ) );
9897 const size_t kend( ( IsUpper_v<MT5> )
9898 ?( ( IsLower_v<MT4> )
9899 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9900 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9901 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
9902
9903 size_t k( kbegin );
9904
9905 if( k < kend )
9906 {
9907 SIMDType a1( A.load(i ,k) );
9908 SIMDType a2( A.load(i+SIMDSIZE,k) );
9909 SIMDType b1( set( B(k,j ) ) );
9910 SIMDType b2( set( B(k,j+1UL) ) );
9911 SIMDType xmm1( a1 * b1 );
9912 SIMDType xmm2( a2 * b1 );
9913 SIMDType xmm3( a1 * b2 );
9914 SIMDType xmm4( a2 * b2 );
9915
9916 for( ++k; k<kend; ++k ) {
9917 a1 = A.load(i ,k);
9918 a2 = A.load(i+SIMDSIZE,k);
9919 b1 = set( B(k,j ) );
9920 b2 = set( B(k,j+1UL) );
9921 xmm1 += a1 * b1;
9922 xmm2 += a2 * b1;
9923 xmm3 += a1 * b2;
9924 xmm4 += a2 * b2;
9925 }
9926
9927 C.store( i , j , xmm1 * factor );
9928 C.store( i+SIMDSIZE, j , xmm2 * factor );
9929 C.store( i , j+1UL, xmm3 * factor );
9930 C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
9931 }
9932 else
9933 {
9934 const SIMDType zero;
9935 C.store( i , j , zero );
9936 C.store( i+SIMDSIZE, j , zero );
9937 C.store( i , j+1UL, zero );
9938 C.store( i+SIMDSIZE, j+1UL, zero );
9939 }
9940 }
9941
9942 if( j < jend )
9943 {
9944 const size_t kbegin( ( IsLower_v<MT5> )
9945 ?( ( IsUpper_v<MT4> )
9946 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9947 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9948 :( IsUpper_v<MT4> ? i : 0UL ) );
9949 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
9950
9951 size_t k( kbegin );
9952
9953 if( k < kend )
9954 {
9955 SIMDType b1( set( B(k,j) ) );
9956 SIMDType xmm1( A.load(i ,k) * b1 );
9957 SIMDType xmm2( A.load(i+SIMDSIZE,k) * b1 );
9958
9959 for( ++k; k<kend; ++k ) {
9960 b1 = set( B(k,j) );
9961 xmm1 += A.load(i ,k) * b1;
9962 xmm2 += A.load(i+SIMDSIZE,k) * b1;
9963 }
9964
9965 C.store( i , j, xmm1 * factor );
9966 C.store( i+SIMDSIZE, j, xmm2 * factor );
9967 }
9968 else
9969 {
9970 const SIMDType zero;
9971 C.store( i , j, zero );
9972 C.store( i+SIMDSIZE, j, zero );
9973 }
9974
9975 if( LOW ) ++j;
9976 }
9977
9978 if( LOW ) {
9979 const size_t iiend( min(i+SIMDSIZE*2UL,M) );
9980 for( ; j<N; ++j ) {
9981 for( size_t ii=i; ii<iiend; ++ii ) {
9982 reset( C(ii,j) );
9983 }
9984 }
9985 }
9986 }
9987
9988 for( ; i<ipos; i+=SIMDSIZE )
9989 {
9990 const size_t jend( LOW ? min(i+SIMDSIZE,N) : N );
9991 size_t j( 0UL );
9992
9993 if( SYM || HERM ) {
9994 const size_t iiend( min(i+SIMDSIZE,M) );
9995 for( ; j<i; ++j ) {
9996 for( size_t ii=i; ii<iiend; ++ii ) {
9997 C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
9998 }
9999 }
10000 }
10001 else if( UPP ) {
10002 const size_t iiend( min(i+SIMDSIZE,M) );
10003 for( ; j<i; ++j ) {
10004 for( size_t ii=i; ii<iiend; ++ii ) {
10005 reset( C(ii,j) );
10006 }
10007 }
10008 }
10009
10010 for( ; (j+4UL) <= jend; j+=4UL )
10011 {
10012 const size_t kbegin( ( IsLower_v<MT5> )
10013 ?( ( IsUpper_v<MT4> )
10014 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10015 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10016 :( IsUpper_v<MT4> ? i : 0UL ) );
10017 const size_t kend( ( IsUpper_v<MT5> )
10018 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
10019 :( K ) );
10020
10021 size_t k( kbegin );
10022
10023 if( k < kend )
10024 {
10025 SIMDType a1( A.load(i,k) );
10026 SIMDType xmm1( a1 * set( B(k,j ) ) );
10027 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
10028 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
10029 SIMDType xmm4( a1 * set( B(k,j+3UL) ) );
10030
10031 for( ++k; k<kend; ++k ) {
10032 a1 = A.load(i,k);
10033 xmm1 += a1 * set( B(k,j ) );
10034 xmm2 += a1 * set( B(k,j+1UL) );
10035 xmm3 += a1 * set( B(k,j+2UL) );
10036 xmm4 += a1 * set( B(k,j+3UL) );
10037 }
10038
10039 C.store( i, j , xmm1 * factor );
10040 C.store( i, j+1UL, xmm2 * factor );
10041 C.store( i, j+2UL, xmm3 * factor );
10042 C.store( i, j+3UL, xmm4 * factor );
10043 }
10044 else
10045 {
10046 const SIMDType zero;
10047 C.store( i, j , zero );
10048 C.store( i, j+1UL, zero );
10049 C.store( i, j+2UL, zero );
10050 C.store( i, j+3UL, zero );
10051 }
10052 }
10053
10054 for( ; (j+3UL) <= jend; j+=3UL )
10055 {
10056 const size_t kbegin( ( IsLower_v<MT5> )
10057 ?( ( IsUpper_v<MT4> )
10058 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10059 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10060 :( IsUpper_v<MT4> ? i : 0UL ) );
10061 const size_t kend( ( IsUpper_v<MT5> )
10062 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
10063 :( K ) );
10064
10065 size_t k( kbegin );
10066
10067 if( k < kend )
10068 {
10069 SIMDType a1( A.load(i,k) );
10070 SIMDType xmm1( a1 * set( B(k,j ) ) );
10071 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
10072 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
10073
10074 for( ++k; k<kend; ++k ) {
10075 a1 = A.load(i,k);
10076 xmm1 += a1 * set( B(k,j ) );
10077 xmm2 += a1 * set( B(k,j+1UL) );
10078 xmm3 += a1 * set( B(k,j+2UL) );
10079 }
10080
10081 C.store( i, j , xmm1 * factor );
10082 C.store( i, j+1UL, xmm2 * factor );
10083 C.store( i, j+2UL, xmm3 * factor );
10084 }
10085 else
10086 {
10087 const SIMDType zero;
10088 C.store( i, j , zero );
10089 C.store( i, j+1UL, zero );
10090 C.store( i, j+2UL, zero );
10091 }
10092 }
10093
10094 for( ; (j+2UL) <= jend; j+=2UL )
10095 {
10096 const size_t kbegin( ( IsLower_v<MT5> )
10097 ?( ( IsUpper_v<MT4> )
10098 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10099 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10100 :( IsUpper_v<MT4> ? i : 0UL ) );
10101 const size_t kend( ( IsUpper_v<MT5> )
10102 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
10103 :( K ) );
10104
10105 size_t k( kbegin );
10106
10107 if( k < kend )
10108 {
10109 SIMDType a1( A.load(i,k) );
10110 SIMDType xmm1( a1 * set( B(k,j ) ) );
10111 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
10112
10113 for( ++k; k<kend; ++k ) {
10114 a1 = A.load(i,k);
10115 xmm1 += a1 * set( B(k,j ) );
10116 xmm2 += a1 * set( B(k,j+1UL) );
10117 }
10118
10119 C.store( i, j , xmm1 * factor );
10120 C.store( i, j+1UL, xmm2 * factor );
10121 }
10122 else
10123 {
10124 const SIMDType zero;
10125 C.store( i, j , zero );
10126 C.store( i, j+1UL, zero );
10127 }
10128 }
10129
10130 if( j < jend )
10131 {
10132 const size_t kbegin( ( IsLower_v<MT5> )
10133 ?( ( IsUpper_v<MT4> )
10134 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10135 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10136 :( IsUpper_v<MT4> ? i : 0UL ) );
10137
10138 size_t k( kbegin );
10139
10140 if( k < K )
10141 {
10142 SIMDType xmm1( A.load(i,k) * set( B(k,j) ) );
10143
10144 for( ++k; k<K; ++k ) {
10145 xmm1 += A.load(i,k) * set( B(k,j) );
10146 }
10147
10148 C.store( i, j, xmm1 * factor );
10149 }
10150 else
10151 {
10152 const SIMDType zero;
10153 C.store( i, j, zero );
10154 }
10155
10156 if( LOW ) ++j;
10157 }
10158
10159 if( LOW ) {
10160 const size_t iiend( min(i+SIMDSIZE,M) );
10161 for( ; j<N; ++j ) {
10162 for( size_t ii=i; ii<iiend; ++ii ) {
10163 reset( C(ii,j) );
10164 }
10165 }
10166 }
10167 }
10168
10169 for( ; remainder && i<M; ++i )
10170 {
10171 size_t j( 0UL );
10172
10173 if( SYM || HERM ) {
10174 for( ; j<i; ++j ) {
10175 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
10176 }
10177 }
10178 else if( UPP ) {
10179 for( ; j<i; ++j ) {
10180 reset( C(i,j) );
10181 }
10182 }
10183
10184 for( ; (j+2UL) <= N; j+=2UL )
10185 {
10186 const size_t kbegin( ( IsLower_v<MT5> )
10187 ?( ( IsUpper_v<MT4> )
10188 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10189 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10190 :( IsUpper_v<MT4> ? i : 0UL ) );
10191 const size_t kend( ( IsUpper_v<MT5> )
10192 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
10193 :( K ) );
10194
10195 size_t k( kbegin );
10196
10197 if( k < kend )
10198 {
10199 ElementType value1( A(i,k) * B(k,j ) );
10200 ElementType value2( A(i,k) * B(k,j+1UL) );
10201
10202 for( ++k; k<kend; ++k ) {
10203 value1 += A(i,k) * B(k,j );
10204 value2 += A(i,k) * B(k,j+1UL);
10205 }
10206
10207 C(i,j ) = value1 * scalar;
10208 C(i,j+1UL) = value2 * scalar;
10209 }
10210 else
10211 {
10212 reset( C(i,j ) );
10213 reset( C(i,j+1UL) );
10214 }
10215 }
10216
10217 if( j < N )
10218 {
10219 const size_t kbegin( ( IsLower_v<MT5> )
10220 ?( ( IsUpper_v<MT4> )
10221 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10222 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10223 :( IsUpper_v<MT4> ? i : 0UL ) );
10224
10225 size_t k( kbegin );
10226
10227 if( k < K )
10228 {
10229 ElementType value( A(i,k) * B(k,j) );
10230
10231 for( ++k; k<K; ++k ) {
10232 value += A(i,k) * B(k,j);
10233 }
10234
10235 C(i,j) = value * scalar;
10236 }
10237 else
10238 {
10239 reset( C(i,j) );
10240 }
10241 }
10242 }
10243 }
10244 //**********************************************************************************************
10245
10246 //**Default assignment to dense matrices (large matrices)***************************************
10260 template< typename MT3 // Type of the left-hand side target matrix
10261 , typename MT4 // Type of the left-hand side matrix operand
10262 , typename MT5 // Type of the right-hand side matrix operand
10263 , typename ST2 > // Type of the scalar value
10264 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10265 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10266 {
10267 selectDefaultAssignKernel( C, A, B, scalar );
10268 }
10269 //**********************************************************************************************
10270
10271 //**Vectorized default assignment to dense matrices (large matrices)****************************
10286 template< typename MT3 // Type of the left-hand side target matrix
10287 , typename MT4 // Type of the left-hand side matrix operand
10288 , typename MT5 // Type of the right-hand side matrix operand
10289 , typename ST2 > // Type of the scalar value
10290 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10291 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10292 {
10293 if( SYM )
10294 smmm( C, A, B, scalar );
10295 else if( HERM )
10296 hmmm( C, A, B, scalar );
10297 else if( LOW )
10298 lmmm( C, A, B, scalar, ST2(0) );
10299 else if( UPP )
10300 ummm( C, A, B, scalar, ST2(0) );
10301 else
10302 mmm( C, A, B, scalar, ST2(0) );
10303 }
10304 //**********************************************************************************************
10305
10306 //**BLAS-based assignment to dense matrices (default)*******************************************
10320 template< typename MT3 // Type of the left-hand side target matrix
10321 , typename MT4 // Type of the left-hand side matrix operand
10322 , typename MT5 // Type of the right-hand side matrix operand
10323 , typename ST2 > // Type of the scalar value
10324 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10325 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
10326 {
10327 selectLargeAssignKernel( C, A, B, scalar );
10328 }
10329 //**********************************************************************************************
10330
10331 //**BLAS-based assignment to dense matrices*****************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
/*!\brief BLAS-based assignment kernel.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is only viable when \c UseBlasKernel_v evaluates to \c true for the given
// combination of types, and it is only compiled if both \c BLAZE_BLAS_MODE and
// \c BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are active. Three cases are distinguished:
//
//  - \a A is triangular: \a B is copied into \a C, then \c trmm() is applied from the left;
//  - otherwise, \a B is triangular: \a A is copied into \a C, then \c trmm() is applied from
//    the right;
//  - otherwise: \c gemm() is called with the converted scalar and a second scalar of zero.
//
// In all cases the scalar is converted to the element type of the target matrix first.
*/
template< typename MT3, typename MT4, typename MT5, typename ST2 >
static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
{
   using ET = ElementType_t<MT3>;

   if( IsTriangular_v<MT4> )
   {
      // Triangular left operand: C starts out as a copy of B
      assign( C, B );
      trmm( C, A, CblasLeft, ( IsLower_v<MT4> ? CblasLower : CblasUpper ), ET(scalar) );
      return;
   }

   if( IsTriangular_v<MT5> )
   {
      // Triangular right operand: C starts out as a copy of A
      assign( C, A );
      trmm( C, B, CblasRight, ( IsLower_v<MT5> ? CblasLower : CblasUpper ), ET(scalar) );
      return;
   }

   // Neither operand is triangular
   gemm( C, A, B, ET(scalar), ET(0) );
}
#endif
10368 //**********************************************************************************************
10369
10370 //**Assignment to sparse matrices***************************************************************
10382 template< typename MT // Type of the target sparse matrix
10383 , bool SO > // Storage order of the target sparse matrix
10384 friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
10385 {
10387
10388 using TmpType = If_t< SO, ResultType, OppositeType >;
10389
10396
10397 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
10398 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
10399
10400 const ForwardFunctor fwd;
10401
10402 const TmpType tmp( serial( rhs ) );
10403 assign( *lhs, fwd( tmp ) );
10404 }
10405 //**********************************************************************************************
10406
10407 //**Addition assignment to dense matrices*******************************************************
10419 template< typename MT // Type of the target dense matrix
10420 , bool SO > // Storage order of the target dense matrix
10421 friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
10422 {
10424
10425 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
10426 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
10427
10428 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
10429 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
10430
10431 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
10432 return;
10433 }
10434
10435 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
10436 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
10437
10438 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
10439 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
10440 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
10441 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
10442 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
10443 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
10444
10445 DMatScalarMultExpr::selectAddAssignKernel( *lhs, A, B, rhs.scalar_ );
10446 }
10447 //**********************************************************************************************
10448
10449 //**Addition assignment to dense matrices (kernel selection)************************************
10460 template< typename MT3 // Type of the left-hand side target matrix
10461 , typename MT4 // Type of the left-hand side matrix operand
10462 , typename MT5 // Type of the right-hand side matrix operand
10463 , typename ST2 > // Type of the scalar value
10464 static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10465 {
10466 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
10467 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
10468 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
10469 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
10470 selectSmallAddAssignKernel( C, A, B, scalar );
10471 else
10472 selectBlasAddAssignKernel( C, A, B, scalar );
10473 }
10474 //**********************************************************************************************
10475
10476 //**Default addition assignment to dense matrices (general/general)*****************************
10490 template< typename MT3 // Type of the left-hand side target matrix
10491 , typename MT4 // Type of the left-hand side matrix operand
10492 , typename MT5 // Type of the right-hand side matrix operand
10493 , typename ST2 > // Type of the scalar value
10494 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10495 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
10496 {
10497 const ResultType tmp( serial( A * B * scalar ) );
10498 addAssign( C, tmp );
10499 }
10500 //**********************************************************************************************
10501
10502 //**Default addition assignment to row-major dense matrices (general/diagonal)******************
10516 template< typename MT3 // Type of the left-hand side target matrix
10517 , typename MT4 // Type of the left-hand side matrix operand
10518 , typename MT5 // Type of the right-hand side matrix operand
10519 , typename ST2 > // Type of the scalar value
10520 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10521 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
10522 {
10523 constexpr size_t block( BLOCK_SIZE );
10524
10525 const size_t M( A.rows() );
10526 const size_t N( B.columns() );
10527
10528 for( size_t ii=0UL; ii<M; ii+=block ) {
10529 const size_t iend( min( M, ii+block ) );
10530 for( size_t jj=0UL; jj<N; jj+=block ) {
10531 const size_t jend( min( N, jj+block ) );
10532 for( size_t i=ii; i<iend; ++i )
10533 {
10534 const size_t jbegin( ( IsUpper_v<MT4> )
10535 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
10536 :( jj ) );
10537 const size_t jpos( ( IsLower_v<MT4> )
10538 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
10539 :( jend ) );
10540
10541 for( size_t j=jbegin; j<jpos; ++j ) {
10542 C(i,j) += A(i,j) * B(j,j) * scalar;
10543 }
10544 }
10545 }
10546 }
10547 }
10548 //**********************************************************************************************
10549
10550 //**Default addition assignment to column-major dense matrices (general/diagonal)***************
10564 template< typename MT3 // Type of the left-hand side target matrix
10565 , typename MT4 // Type of the left-hand side matrix operand
10566 , typename MT5 // Type of the right-hand side matrix operand
10567 , typename ST2 > // Type of the scalar value
10568 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10569 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
10570 {
10571 const size_t M( A.rows() );
10572 const size_t N( B.columns() );
10573
10574 for( size_t j=0UL; j<N; ++j )
10575 {
10576 const size_t ibegin( ( IsLower_v<MT4> )
10577 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
10578 :( 0UL ) );
10579 const size_t iend( ( IsUpper_v<MT4> )
10580 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
10581 :( M ) );
10582 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
10583
10584 const size_t inum( iend - ibegin );
10585 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
10586 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
10587
10588 for( size_t i=ibegin; i<ipos; i+=2UL ) {
10589 C(i ,j) += A(i ,j) * B(j,j) * scalar;
10590 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
10591 }
10592 if( ipos < iend ) {
10593 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
10594 }
10595 }
10596 }
10597 //**********************************************************************************************
10598
10599 //**Default addition assignment to row-major dense matrices (diagonal/general)******************
10613 template< typename MT3 // Type of the left-hand side target matrix
10614 , typename MT4 // Type of the left-hand side matrix operand
10615 , typename MT5 // Type of the right-hand side matrix operand
10616 , typename ST2 > // Type of the scalar value
10617 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10618 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
10619 {
10620 const size_t M( A.rows() );
10621 const size_t N( B.columns() );
10622
10623 for( size_t i=0UL; i<M; ++i )
10624 {
10625 const size_t jbegin( ( IsUpper_v<MT5> )
10626 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
10627 :( 0UL ) );
10628 const size_t jend( ( IsLower_v<MT5> )
10629 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
10630 :( N ) );
10631 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
10632
10633 const size_t jnum( jend - jbegin );
10634 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
10635 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
10636
10637 for( size_t j=jbegin; j<jpos; j+=2UL ) {
10638 C(i,j ) += A(i,i) * B(i,j ) * scalar;
10639 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
10640 }
10641 if( jpos < jend ) {
10642 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
10643 }
10644 }
10645 }
10646 //**********************************************************************************************
10647
10648 //**Default addition assignment to column-major dense matrices (diagonal/general)***************
10662 template< typename MT3 // Type of the left-hand side target matrix
10663 , typename MT4 // Type of the left-hand side matrix operand
10664 , typename MT5 // Type of the right-hand side matrix operand
10665 , typename ST2 > // Type of the scalar value
10666 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10667 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
10668 {
10669 constexpr size_t block( BLOCK_SIZE );
10670
10671 const size_t M( A.rows() );
10672 const size_t N( B.columns() );
10673
10674 for( size_t jj=0UL; jj<N; jj+=block ) {
10675 const size_t jend( min( N, jj+block ) );
10676 for( size_t ii=0UL; ii<M; ii+=block ) {
10677 const size_t iend( min( M, ii+block ) );
10678 for( size_t j=jj; j<jend; ++j )
10679 {
10680 const size_t ibegin( ( IsLower_v<MT5> )
10681 ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
10682 :( ii ) );
10683 const size_t ipos( ( IsUpper_v<MT5> )
10684 ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
10685 :( iend ) );
10686
10687 for( size_t i=ibegin; i<ipos; ++i ) {
10688 C(i,j) += A(i,i) * B(i,j) * scalar;
10689 }
10690 }
10691 }
10692 }
10693 }
10694 //**********************************************************************************************
10695
10696 //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
10710 template< typename MT3 // Type of the left-hand side target matrix
10711 , typename MT4 // Type of the left-hand side matrix operand
10712 , typename MT5 // Type of the right-hand side matrix operand
10713 , typename ST2 > // Type of the scalar value
10714 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10715 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
10716 {
10717 for( size_t i=0UL; i<A.rows(); ++i ) {
10718 C(i,i) += A(i,i) * B(i,i) * scalar;
10719 }
10720 }
10721 //**********************************************************************************************
10722
10723 //**Default addition assignment to dense matrices (small matrices)******************************
10737 template< typename MT3 // Type of the left-hand side target matrix
10738 , typename MT4 // Type of the left-hand side matrix operand
10739 , typename MT5 // Type of the right-hand side matrix operand
10740 , typename ST2 > // Type of the scalar value
10741 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10742 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10743 {
10744 selectDefaultAddAssignKernel( C, A, B, scalar );
10745 }
10746 //**********************************************************************************************
10747
10748 //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
10763 template< typename MT3 // Type of the left-hand side target matrix
10764 , typename MT4 // Type of the left-hand side matrix operand
10765 , typename MT5 // Type of the right-hand side matrix operand
10766 , typename ST2 > // Type of the scalar value
10767 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10768 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10769 {
10770 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
10771
10772 const size_t M( A.rows() );
10773 const size_t N( B.columns() );
10774 const size_t K( A.columns() );
10775
10776 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
10777
10778 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
10779 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
10780
10781 const SIMDType factor( set( scalar ) );
10782
10783 size_t j( 0UL );
10784
10785 if( IsIntegral_v<ElementType> )
10786 {
10787 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
10788 for( size_t i=0UL; i<M; ++i )
10789 {
10790 const size_t kbegin( ( IsUpper_v<MT4> )
10791 ?( ( IsLower_v<MT5> )
10792 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10793 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10794 :( IsLower_v<MT5> ? j : 0UL ) );
10795 const size_t kend( ( IsLower_v<MT4> )
10796 ?( ( IsUpper_v<MT5> )
10797 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
10798 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
10799 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
10800
10801 size_t k( kbegin );
10802
10803 if( k < kend )
10804 {
10805 SIMDType a1( set( A(i,k) ) );
10806 SIMDType xmm1( a1 * B.load(k,j ) );
10807 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
10808 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
10809 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
10810 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
10811 SIMDType xmm6( a1 * B.load(k,j+SIMDSIZE*5UL) );
10812 SIMDType xmm7( a1 * B.load(k,j+SIMDSIZE*6UL) );
10813 SIMDType xmm8( a1 * B.load(k,j+SIMDSIZE*7UL) );
10814
10815 for( ++k; k<kend; ++k ) {
10816 a1 = set( A(i,k) );
10817 xmm1 += a1 * B.load(k,j );
10818 xmm2 += a1 * B.load(k,j+SIMDSIZE );
10819 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
10820 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
10821 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
10822 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
10823 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
10824 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
10825 }
10826
10827 C.store( i, j , C.load(i,j ) + xmm1 * factor );
10828 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
10829 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
10830 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
10831 C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
10832 C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
10833 C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
10834 C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
10835 }
10836 }
10837 }
10838 }
10839
10840 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
10841 {
10842 size_t i( 0UL );
10843
10844 for( ; (i+2UL) <= M; i+=2UL )
10845 {
10846 const size_t kbegin( ( IsUpper_v<MT4> )
10847 ?( ( IsLower_v<MT5> )
10848 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10849 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10850 :( IsLower_v<MT5> ? j : 0UL ) );
10851 const size_t kend( ( IsLower_v<MT4> )
10852 ?( ( IsUpper_v<MT5> )
10853 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
10854 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10855 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
10856
10857 size_t k( kbegin );
10858
10859 if( k < kend )
10860 {
10861 SIMDType a1( set( A(i ,k) ) );
10862 SIMDType a2( set( A(i+1UL,k) ) );
10863 SIMDType b1( B.load(k,j ) );
10864 SIMDType b2( B.load(k,j+SIMDSIZE ) );
10865 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
10866 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
10867 SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
10868 SIMDType xmm1 ( a1 * b1 );
10869 SIMDType xmm2 ( a1 * b2 );
10870 SIMDType xmm3 ( a1 * b3 );
10871 SIMDType xmm4 ( a1 * b4 );
10872 SIMDType xmm5 ( a1 * b5 );
10873 SIMDType xmm6 ( a2 * b1 );
10874 SIMDType xmm7 ( a2 * b2 );
10875 SIMDType xmm8 ( a2 * b3 );
10876 SIMDType xmm9 ( a2 * b4 );
10877 SIMDType xmm10( a2 * b5 );
10878
10879 for( ++k; k<kend; ++k ) {
10880 a1 = set( A(i ,k) );
10881 a2 = set( A(i+1UL,k) );
10882 b1 = B.load(k,j );
10883 b2 = B.load(k,j+SIMDSIZE );
10884 b3 = B.load(k,j+SIMDSIZE*2UL);
10885 b4 = B.load(k,j+SIMDSIZE*3UL);
10886 b5 = B.load(k,j+SIMDSIZE*4UL);
10887 xmm1 += a1 * b1;
10888 xmm2 += a1 * b2;
10889 xmm3 += a1 * b3;
10890 xmm4 += a1 * b4;
10891 xmm5 += a1 * b5;
10892 xmm6 += a2 * b1;
10893 xmm7 += a2 * b2;
10894 xmm8 += a2 * b3;
10895 xmm9 += a2 * b4;
10896 xmm10 += a2 * b5;
10897 }
10898
10899 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10900 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
10901 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
10902 C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
10903 C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
10904 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
10905 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
10906 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
10907 C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
10908 C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
10909 }
10910 }
10911
10912 if( i < M )
10913 {
10914 const size_t kbegin( ( IsUpper_v<MT4> )
10915 ?( ( IsLower_v<MT5> )
10916 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10917 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10918 :( IsLower_v<MT5> ? j : 0UL ) );
10919 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
10920
10921 size_t k( kbegin );
10922
10923 if( k < kend )
10924 {
10925 SIMDType a1( set( A(i,k) ) );
10926 SIMDType xmm1( a1 * B.load(k,j ) );
10927 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
10928 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
10929 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
10930 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
10931
10932 for( ++k; k<kend; ++k ) {
10933 a1 = set( A(i,k) );
10934 xmm1 += a1 * B.load(k,j );
10935 xmm2 += a1 * B.load(k,j+SIMDSIZE );
10936 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
10937 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
10938 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
10939 }
10940
10941 C.store( i, j , C.load(i,j ) + xmm1 * factor );
10942 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
10943 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
10944 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
10945 C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
10946 }
10947 }
10948 }
10949
10950 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
10951 {
10952 size_t i( 0UL );
10953
10954 for( ; (i+2UL) <= M; i+=2UL )
10955 {
10956 const size_t kbegin( ( IsUpper_v<MT4> )
10957 ?( ( IsLower_v<MT5> )
10958 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10959 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10960 :( IsLower_v<MT5> ? j : 0UL ) );
10961 const size_t kend( ( IsLower_v<MT4> )
10962 ?( ( IsUpper_v<MT5> )
10963 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
10964 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10965 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
10966
10967 size_t k( kbegin );
10968
10969 if( k < kend )
10970 {
10971 SIMDType a1( set( A(i ,k) ) );
10972 SIMDType a2( set( A(i+1UL,k) ) );
10973 SIMDType b1( B.load(k,j ) );
10974 SIMDType b2( B.load(k,j+SIMDSIZE ) );
10975 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
10976 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
10977 SIMDType xmm1( a1 * b1 );
10978 SIMDType xmm2( a1 * b2 );
10979 SIMDType xmm3( a1 * b3 );
10980 SIMDType xmm4( a1 * b4 );
10981 SIMDType xmm5( a2 * b1 );
10982 SIMDType xmm6( a2 * b2 );
10983 SIMDType xmm7( a2 * b3 );
10984 SIMDType xmm8( a2 * b4 );
10985
10986 for( ++k; k<kend; ++k ) {
10987 a1 = set( A(i ,k) );
10988 a2 = set( A(i+1UL,k) );
10989 b1 = B.load(k,j );
10990 b2 = B.load(k,j+SIMDSIZE );
10991 b3 = B.load(k,j+SIMDSIZE*2UL);
10992 b4 = B.load(k,j+SIMDSIZE*3UL);
10993 xmm1 += a1 * b1;
10994 xmm2 += a1 * b2;
10995 xmm3 += a1 * b3;
10996 xmm4 += a1 * b4;
10997 xmm5 += a2 * b1;
10998 xmm6 += a2 * b2;
10999 xmm7 += a2 * b3;
11000 xmm8 += a2 * b4;
11001 }
11002
11003 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11004 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
11005 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
11006 C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
11007 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
11008 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
11009 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
11010 C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
11011 }
11012 }
11013
11014 if( i < M )
11015 {
11016 const size_t kbegin( ( IsUpper_v<MT4> )
11017 ?( ( IsLower_v<MT5> )
11018 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11019 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11020 :( IsLower_v<MT5> ? j : 0UL ) );
11021 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
11022
11023 size_t k( kbegin );
11024
11025 if( k < kend )
11026 {
11027 SIMDType a1( set( A(i,k) ) );
11028 SIMDType xmm1( a1 * B.load(k,j ) );
11029 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
11030 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
11031 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
11032
11033 for( ++k; k<kend; ++k ) {
11034 a1 = set( A(i,k) );
11035 xmm1 += a1 * B.load(k,j );
11036 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11037 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11038 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11039 }
11040
11041 C.store( i, j , C.load(i,j ) + xmm1 * factor );
11042 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
11043 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
11044 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
11045 }
11046 }
11047 }
11048
11049 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
11050 {
11051 size_t i( 0UL );
11052
11053 for( ; (i+2UL) <= M; i+=2UL )
11054 {
11055 const size_t kbegin( ( IsUpper_v<MT4> )
11056 ?( ( IsLower_v<MT5> )
11057 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11058 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11059 :( IsLower_v<MT5> ? j : 0UL ) );
11060 const size_t kend( ( IsLower_v<MT4> )
11061 ?( ( IsUpper_v<MT5> )
11062 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
11063 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11064 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
11065
11066 size_t k( kbegin );
11067
11068 if( k < kend )
11069 {
11070 SIMDType a1( set( A(i ,k) ) );
11071 SIMDType a2( set( A(i+1UL,k) ) );
11072 SIMDType b1( B.load(k,j ) );
11073 SIMDType b2( B.load(k,j+SIMDSIZE ) );
11074 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11075 SIMDType xmm1( a1 * b1 );
11076 SIMDType xmm2( a1 * b2 );
11077 SIMDType xmm3( a1 * b3 );
11078 SIMDType xmm4( a2 * b1 );
11079 SIMDType xmm5( a2 * b2 );
11080 SIMDType xmm6( a2 * b3 );
11081
11082 for( ++k; k<kend; ++k ) {
11083 a1 = set( A(i ,k) );
11084 a2 = set( A(i+1UL,k) );
11085 b1 = B.load(k,j );
11086 b2 = B.load(k,j+SIMDSIZE );
11087 b3 = B.load(k,j+SIMDSIZE*2UL);
11088 xmm1 += a1 * b1;
11089 xmm2 += a1 * b2;
11090 xmm3 += a1 * b3;
11091 xmm4 += a2 * b1;
11092 xmm5 += a2 * b2;
11093 xmm6 += a2 * b3;
11094 }
11095
11096 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11097 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
11098 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
11099 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
11100 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
11101 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
11102 }
11103 }
11104
11105 if( i < M )
11106 {
11107 const size_t kbegin( ( IsUpper_v<MT4> )
11108 ?( ( IsLower_v<MT5> )
11109 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11110 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11111 :( IsLower_v<MT5> ? j : 0UL ) );
11112 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
11113
11114 size_t k( kbegin );
11115
11116 if( k < kend )
11117 {
11118 SIMDType a1( set( A(i,k) ) );
11119 SIMDType xmm1( a1 * B.load(k,j ) );
11120 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
11121 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
11122
11123 for( ++k; k<kend; ++k ) {
11124 a1 = set( A(i,k) );
11125 xmm1 += a1 * B.load(k,j );
11126 xmm2 += a1 * B.load(k,j+SIMDSIZE );
11127 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11128 }
11129
11130 C.store( i, j , C.load(i,j ) + xmm1 * factor );
11131 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
11132 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
11133 }
11134 }
11135 }
11136
11137 for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
11138 {
11139 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
11140 size_t i( LOW ? j : 0UL );
11141
11142 for( ; (i+4UL) <= iend; i+=4UL )
11143 {
11144 const size_t kbegin( ( IsUpper_v<MT4> )
11145 ?( ( IsLower_v<MT5> )
11146 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11147 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11148 :( IsLower_v<MT5> ? j : 0UL ) );
11149 const size_t kend( ( IsLower_v<MT4> )
11150 ?( ( IsUpper_v<MT5> )
11151 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
11152 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
11153 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
11154
11155 size_t k( kbegin );
11156
11157 if( k < kend )
11158 {
11159 SIMDType a1( set( A(i ,k) ) );
11160 SIMDType a2( set( A(i+1UL,k) ) );
11161 SIMDType a3( set( A(i+2UL,k) ) );
11162 SIMDType a4( set( A(i+3UL,k) ) );
11163 SIMDType b1( B.load(k,j ) );
11164 SIMDType b2( B.load(k,j+SIMDSIZE) );
11165 SIMDType xmm1( a1 * b1 );
11166 SIMDType xmm2( a1 * b2 );
11167 SIMDType xmm3( a2 * b1 );
11168 SIMDType xmm4( a2 * b2 );
11169 SIMDType xmm5( a3 * b1 );
11170 SIMDType xmm6( a3 * b2 );
11171 SIMDType xmm7( a4 * b1 );
11172 SIMDType xmm8( a4 * b2 );
11173
11174 for( ++k; k<kend; ++k ) {
11175 a1 = set( A(i ,k) );
11176 a2 = set( A(i+1UL,k) );
11177 a3 = set( A(i+2UL,k) );
11178 a4 = set( A(i+3UL,k) );
11179 b1 = B.load(k,j );
11180 b2 = B.load(k,j+SIMDSIZE);
11181 xmm1 += a1 * b1;
11182 xmm2 += a1 * b2;
11183 xmm3 += a2 * b1;
11184 xmm4 += a2 * b2;
11185 xmm5 += a3 * b1;
11186 xmm6 += a3 * b2;
11187 xmm7 += a4 * b1;
11188 xmm8 += a4 * b2;
11189 }
11190
11191 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11192 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
11193 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
11194 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
11195 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
11196 C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
11197 C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
11198 C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
11199 }
11200 }
11201
11202 for( ; (i+3UL) <= iend; i+=3UL )
11203 {
11204 const size_t kbegin( ( IsUpper_v<MT4> )
11205 ?( ( IsLower_v<MT5> )
11206 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11207 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11208 :( IsLower_v<MT5> ? j : 0UL ) );
11209 const size_t kend( ( IsLower_v<MT4> )
11210 ?( ( IsUpper_v<MT5> )
11211 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
11212 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
11213 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
11214
11215 size_t k( kbegin );
11216
11217 if( k < kend )
11218 {
11219 SIMDType a1( set( A(i ,k) ) );
11220 SIMDType a2( set( A(i+1UL,k) ) );
11221 SIMDType a3( set( A(i+2UL,k) ) );
11222 SIMDType b1( B.load(k,j ) );
11223 SIMDType b2( B.load(k,j+SIMDSIZE) );
11224 SIMDType xmm1( a1 * b1 );
11225 SIMDType xmm2( a1 * b2 );
11226 SIMDType xmm3( a2 * b1 );
11227 SIMDType xmm4( a2 * b2 );
11228 SIMDType xmm5( a3 * b1 );
11229 SIMDType xmm6( a3 * b2 );
11230
11231 for( ++k; k<kend; ++k ) {
11232 a1 = set( A(i ,k) );
11233 a2 = set( A(i+1UL,k) );
11234 a3 = set( A(i+2UL,k) );
11235 b1 = B.load(k,j );
11236 b2 = B.load(k,j+SIMDSIZE);
11237 xmm1 += a1 * b1;
11238 xmm2 += a1 * b2;
11239 xmm3 += a2 * b1;
11240 xmm4 += a2 * b2;
11241 xmm5 += a3 * b1;
11242 xmm6 += a3 * b2;
11243 }
11244
11245 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11246 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
11247 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
11248 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
11249 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
11250 C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
11251 }
11252 }
11253
11254 for( ; (i+2UL) <= iend; i+=2UL )
11255 {
11256 const size_t kbegin( ( IsUpper_v<MT4> )
11257 ?( ( IsLower_v<MT5> )
11258 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11259 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11260 :( IsLower_v<MT5> ? j : 0UL ) );
11261 const size_t kend( ( IsLower_v<MT4> )
11262 ?( ( IsUpper_v<MT5> )
11263 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
11264 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11265 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
11266
11267 size_t k( kbegin );
11268
11269 if( k < kend )
11270 {
11271 SIMDType a1( set( A(i ,k) ) );
11272 SIMDType a2( set( A(i+1UL,k) ) );
11273 SIMDType b1( B.load(k,j ) );
11274 SIMDType b2( B.load(k,j+SIMDSIZE) );
11275 SIMDType xmm1( a1 * b1 );
11276 SIMDType xmm2( a1 * b2 );
11277 SIMDType xmm3( a2 * b1 );
11278 SIMDType xmm4( a2 * b2 );
11279
11280 for( ++k; k<kend; ++k ) {
11281 a1 = set( A(i ,k) );
11282 a2 = set( A(i+1UL,k) );
11283 b1 = B.load(k,j );
11284 b2 = B.load(k,j+SIMDSIZE);
11285 xmm1 += a1 * b1;
11286 xmm2 += a1 * b2;
11287 xmm3 += a2 * b1;
11288 xmm4 += a2 * b2;
11289 }
11290
11291 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11292 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
11293 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
11294 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
11295 }
11296 }
11297
11298 if( i < iend )
11299 {
11300 const size_t kbegin( ( IsUpper_v<MT4> )
11301 ?( ( IsLower_v<MT5> )
11302 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11303 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11304 :( IsLower_v<MT5> ? j : 0UL ) );
11305 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
11306
11307 size_t k( kbegin );
11308
11309 if( k < kend )
11310 {
11311 SIMDType a1( set( A(i,k) ) );
11312 SIMDType xmm1( a1 * B.load(k,j ) );
11313 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE) );
11314
11315 for( ++k; k<kend; ++k ) {
11316 a1 = set( A(i,k) );
11317 xmm1 += a1 * B.load(k,j );
11318 xmm2 += a1 * B.load(k,j+SIMDSIZE);
11319 }
11320
11321 C.store( i, j , C.load(i,j ) + xmm1 * factor );
11322 C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) + xmm2 * factor );
11323 }
11324 }
11325 }
11326
11327 for( ; j<jpos; j+=SIMDSIZE )
11328 {
11329 const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
11330 size_t i( LOW ? j : 0UL );
11331
11332 for( ; (i+4UL) <= iend; i+=4UL )
11333 {
11334 const size_t kbegin( ( IsUpper_v<MT4> )
11335 ?( ( IsLower_v<MT5> )
11336 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11337 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11338 :( IsLower_v<MT5> ? j : 0UL ) );
11339 const size_t kend( ( IsLower_v<MT4> )
11340 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
11341 :( K ) );
11342
11343 size_t k( kbegin );
11344
11345 if( k < kend )
11346 {
11347 SIMDType b1( B.load(k,j) );
11348 SIMDType xmm1( set( A(i ,k) ) * b1 );
11349 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
11350 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
11351 SIMDType xmm4( set( A(i+3UL,k) ) * b1 );
11352
11353 for( ++k; k<kend; ++k ) {
11354 b1 = B.load(k,j);
11355 xmm1 += set( A(i ,k) ) * b1;
11356 xmm2 += set( A(i+1UL,k) ) * b1;
11357 xmm3 += set( A(i+2UL,k) ) * b1;
11358 xmm4 += set( A(i+3UL,k) ) * b1;
11359 }
11360
11361 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11362 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
11363 C.store( i+2UL, j, C.load(i+2UL,j) + xmm3 * factor );
11364 C.store( i+3UL, j, C.load(i+3UL,j) + xmm4 * factor );
11365 }
11366 }
11367
11368 for( ; (i+3UL) <= iend; i+=3UL )
11369 {
11370 const size_t kbegin( ( IsUpper_v<MT4> )
11371 ?( ( IsLower_v<MT5> )
11372 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11373 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11374 :( IsLower_v<MT5> ? j : 0UL ) );
11375 const size_t kend( ( IsLower_v<MT4> )
11376 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
11377 :( K ) );
11378
11379 size_t k( kbegin );
11380
11381 if( k < kend )
11382 {
11383 SIMDType b1( B.load(k,j) );
11384 SIMDType xmm1( set( A(i ,k) ) * b1 );
11385 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
11386 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
11387
11388 for( ++k; k<kend; ++k ) {
11389 b1 = B.load(k,j);
11390 xmm1 += set( A(i ,k) ) * b1;
11391 xmm2 += set( A(i+1UL,k) ) * b1;
11392 xmm3 += set( A(i+2UL,k) ) * b1;
11393 }
11394
11395 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11396 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
11397 C.store( i+2UL, j, C.load(i+2UL,j) + xmm3 * factor );
11398 }
11399 }
11400
11401 for( ; (i+2UL) <= iend; i+=2UL )
11402 {
11403 const size_t kbegin( ( IsUpper_v<MT4> )
11404 ?( ( IsLower_v<MT5> )
11405 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11406 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11407 :( IsLower_v<MT5> ? j : 0UL ) );
11408 const size_t kend( ( IsLower_v<MT4> )
11409 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
11410 :( K ) );
11411
11412 size_t k( kbegin );
11413
11414 if( k < kend )
11415 {
11416 SIMDType b1( B.load(k,j) );
11417 SIMDType xmm1( set( A(i ,k) ) * b1 );
11418 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
11419
11420 for( ++k; k<kend; ++k ) {
11421 b1 = B.load(k,j);
11422 xmm1 += set( A(i ,k) ) * b1;
11423 xmm2 += set( A(i+1UL,k) ) * b1;
11424 }
11425
11426 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11427 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
11428 }
11429 }
11430
11431 if( i < iend )
11432 {
11433 const size_t kbegin( ( IsUpper_v<MT4> )
11434 ?( ( IsLower_v<MT5> )
11435 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11436 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11437 :( IsLower_v<MT5> ? j : 0UL ) );
11438
11439 size_t k( kbegin );
11440
11441 if( k < K )
11442 {
11443 SIMDType xmm1( set( A(i,k) ) * B.load(k,j) );
11444
11445 for( ++k; k<K; ++k ) {
11446 xmm1 += set( A(i,k) ) * B.load(k,j);
11447 }
11448
11449 C.store( i, j, C.load(i,j) + xmm1 * factor );
11450 }
11451 }
11452 }
11453
11454 for( ; remainder && j<N; ++j )
11455 {
11456 const size_t iend( UPP ? j+1UL : M );
11457 size_t i( LOW ? j : 0UL );
11458
11459 for( ; (i+2UL) <= iend; i+=2UL )
11460 {
11461 const size_t kbegin( ( IsUpper_v<MT4> )
11462 ?( ( IsLower_v<MT5> )
11463 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11464 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11465 :( IsLower_v<MT5> ? j : 0UL ) );
11466 const size_t kend( ( IsLower_v<MT4> )
11467 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
11468 :( K ) );
11469
11470 size_t k( kbegin );
11471
11472 if( k < kend )
11473 {
11474 ElementType value1( A(i ,k) * B(k,j) );
11475 ElementType value2( A(i+1UL,k) * B(k,j) );
11476
11477 for( ++k; k<kend; ++k ) {
11478 value1 += A(i ,k) * B(k,j);
11479 value2 += A(i+1UL,k) * B(k,j);
11480 }
11481
11482 C(i ,j) += value1 * scalar;
11483 C(i+1UL,j) += value2 * scalar;
11484 }
11485 }
11486
11487 if( i < iend )
11488 {
11489 const size_t kbegin( ( IsUpper_v<MT4> )
11490 ?( ( IsLower_v<MT5> )
11491 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11492 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11493 :( IsLower_v<MT5> ? j : 0UL ) );
11494
11495 size_t k( kbegin );
11496
11497 if( k < K )
11498 {
11499 ElementType value( A(i,k) * B(k,j) );
11500
11501 for( ++k; k<K; ++k ) {
11502 value += A(i,k) * B(k,j);
11503 }
11504
11505 C(i,j) += value * scalar;
11506 }
11507 }
11508 }
11509 }
11510 //**********************************************************************************************
11511
11512 //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
11527 template< typename MT3 // Type of the left-hand side target matrix
11528 , typename MT4 // Type of the left-hand side matrix operand
11529 , typename MT5 // Type of the right-hand side matrix operand
11530 , typename ST2 > // Type of the scalar value
11531 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11532 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11533 {
11534 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
11535
11536 const size_t M( A.rows() );
11537 const size_t N( B.columns() );
11538 const size_t K( A.columns() );
11539
11540 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
11541
11542 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
11543 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
11544
11545 const SIMDType factor( set( scalar ) );
11546
11547 size_t i( 0UL );
11548
11549 if( IsIntegral_v<ElementType> )
11550 {
11551 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
11552 for( size_t j=0UL; j<N; ++j )
11553 {
11554 const size_t kbegin( ( IsLower_v<MT5> )
11555 ?( ( IsUpper_v<MT4> )
11556 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11557 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11558 :( IsUpper_v<MT4> ? i : 0UL ) );
11559 const size_t kend( ( IsUpper_v<MT5> )
11560 ?( ( IsLower_v<MT4> )
11561 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
11562 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
11563 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
11564
11565 size_t k( kbegin );
11566
11567 if( k < kend )
11568 {
11569 SIMDType b1( set( B(k,j) ) );
11570 SIMDType xmm1( A.load(i ,k) * b1 );
11571 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
11572 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
11573 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
11574 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
11575 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,k) * b1 );
11576 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,k) * b1 );
11577 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,k) * b1 );
11578
11579 for( ++k; k<kend; ++k ) {
11580 b1 = set( B(k,j) );
11581 xmm1 += A.load(i ,k) * b1;
11582 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
11583 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
11584 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
11585 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
11586 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
11587 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
11588 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
11589 }
11590
11591 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11592 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
11593 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
11594 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
11595 C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
11596 C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
11597 C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
11598 C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
11599 }
11600 }
11601 }
11602 }
11603
11604 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
11605 {
11606 size_t j( 0UL );
11607
11608 for( ; (j+2UL) <= N; j+=2UL )
11609 {
11610 const size_t kbegin( ( IsLower_v<MT5> )
11611 ?( ( IsUpper_v<MT4> )
11612 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11613 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11614 :( IsUpper_v<MT4> ? i : 0UL ) );
11615 const size_t kend( ( IsUpper_v<MT5> )
11616 ?( ( IsLower_v<MT4> )
11617 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
11618 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
11619 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
11620
11621 size_t k( kbegin );
11622
11623 if( k < kend )
11624 {
11625 SIMDType a1( A.load(i ,k) );
11626 SIMDType a2( A.load(i+SIMDSIZE ,k) );
11627 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
11628 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
11629 SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
11630 SIMDType b1( set( B(k,j ) ) );
11631 SIMDType b2( set( B(k,j+1UL) ) );
11632 SIMDType xmm1 ( a1 * b1 );
11633 SIMDType xmm2 ( a2 * b1 );
11634 SIMDType xmm3 ( a3 * b1 );
11635 SIMDType xmm4 ( a4 * b1 );
11636 SIMDType xmm5 ( a5 * b1 );
11637 SIMDType xmm6 ( a1 * b2 );
11638 SIMDType xmm7 ( a2 * b2 );
11639 SIMDType xmm8 ( a3 * b2 );
11640 SIMDType xmm9 ( a4 * b2 );
11641 SIMDType xmm10( a5 * b2 );
11642
11643 for( ++k; k<kend; ++k ) {
11644 a1 = A.load(i ,k);
11645 a2 = A.load(i+SIMDSIZE ,k);
11646 a3 = A.load(i+SIMDSIZE*2UL,k);
11647 a4 = A.load(i+SIMDSIZE*3UL,k);
11648 a5 = A.load(i+SIMDSIZE*4UL,k);
11649 b1 = set( B(k,j ) );
11650 b2 = set( B(k,j+1UL) );
11651 xmm1 += a1 * b1;
11652 xmm2 += a2 * b1;
11653 xmm3 += a3 * b1;
11654 xmm4 += a4 * b1;
11655 xmm5 += a5 * b1;
11656 xmm6 += a1 * b2;
11657 xmm7 += a2 * b2;
11658 xmm8 += a3 * b2;
11659 xmm9 += a4 * b2;
11660 xmm10 += a5 * b2;
11661 }
11662
11663 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11664 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
11665 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
11666 C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
11667 C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
11668 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
11669 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
11670 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
11671 C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
11672 C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
11673 }
11674 }
11675
11676 if( j < N )
11677 {
11678 const size_t kbegin( ( IsLower_v<MT5> )
11679 ?( ( IsUpper_v<MT4> )
11680 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11681 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11682 :( IsUpper_v<MT4> ? i : 0UL ) );
11683 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
11684
11685 size_t k( kbegin );
11686
11687 if( k < kend )
11688 {
11689 SIMDType b1( set( B(k,j) ) );
11690 SIMDType xmm1( A.load(i ,k) * b1 );
11691 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
11692 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
11693 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
11694 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
11695
11696 for( ++k; k<kend; ++k ) {
11697 b1 = set( B(k,j) );
11698 xmm1 += A.load(i ,k) * b1;
11699 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
11700 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
11701 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
11702 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
11703 }
11704
11705 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11706 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
11707 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
11708 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
11709 C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
11710 }
11711 }
11712 }
11713
11714 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
11715 {
11716 size_t j( 0UL );
11717
11718 for( ; (j+2UL) <= N; j+=2UL )
11719 {
11720 const size_t kbegin( ( IsLower_v<MT5> )
11721 ?( ( IsUpper_v<MT4> )
11722 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11723 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11724 :( IsUpper_v<MT4> ? i : 0UL ) );
11725 const size_t kend( ( IsUpper_v<MT5> )
11726 ?( ( IsLower_v<MT4> )
11727 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
11728 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
11729 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
11730
11731 size_t k( kbegin );
11732
11733 if( k < kend )
11734 {
11735 SIMDType a1( A.load(i ,k) );
11736 SIMDType a2( A.load(i+SIMDSIZE ,k) );
11737 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
11738 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
11739 SIMDType b1( set( B(k,j ) ) );
11740 SIMDType b2( set( B(k,j+1UL) ) );
11741 SIMDType xmm1( a1 * b1 );
11742 SIMDType xmm2( a2 * b1 );
11743 SIMDType xmm3( a3 * b1 );
11744 SIMDType xmm4( a4 * b1 );
11745 SIMDType xmm5( a1 * b2 );
11746 SIMDType xmm6( a2 * b2 );
11747 SIMDType xmm7( a3 * b2 );
11748 SIMDType xmm8( a4 * b2 );
11749
11750 for( ++k; k<kend; ++k ) {
11751 a1 = A.load(i ,k);
11752 a2 = A.load(i+SIMDSIZE ,k);
11753 a3 = A.load(i+SIMDSIZE*2UL,k);
11754 a4 = A.load(i+SIMDSIZE*3UL,k);
11755 b1 = set( B(k,j ) );
11756 b2 = set( B(k,j+1UL) );
11757 xmm1 += a1 * b1;
11758 xmm2 += a2 * b1;
11759 xmm3 += a3 * b1;
11760 xmm4 += a4 * b1;
11761 xmm5 += a1 * b2;
11762 xmm6 += a2 * b2;
11763 xmm7 += a3 * b2;
11764 xmm8 += a4 * b2;
11765 }
11766
11767 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11768 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
11769 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
11770 C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
11771 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
11772 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
11773 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
11774 C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
11775 }
11776 }
11777
11778 if( j < N )
11779 {
11780 const size_t kbegin( ( IsLower_v<MT5> )
11781 ?( ( IsUpper_v<MT4> )
11782 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11783 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11784 :( IsUpper_v<MT4> ? i : 0UL ) );
11785 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
11786
11787 size_t k( kbegin );
11788
11789 if( k < kend )
11790 {
11791 SIMDType b1( set( B(k,j) ) );
11792 SIMDType xmm1( A.load(i ,k) * b1 );
11793 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
11794 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
11795 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
11796
11797 for( ++k; k<kend; ++k ) {
11798 b1 = set( B(k,j) );
11799 xmm1 += A.load(i ,k) * b1;
11800 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
11801 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
11802 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
11803 }
11804
11805 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11806 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
11807 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
11808 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
11809 }
11810 }
11811 }
11812
11813 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
11814 {
11815 size_t j( 0UL );
11816
11817 for( ; (j+2UL) <= N; j+=2UL )
11818 {
11819 const size_t kbegin( ( IsLower_v<MT5> )
11820 ?( ( IsUpper_v<MT4> )
11821 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11822 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11823 :( IsUpper_v<MT4> ? i : 0UL ) );
11824 const size_t kend( ( IsUpper_v<MT5> )
11825 ?( ( IsLower_v<MT4> )
11826 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
11827 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
11828 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
11829
11830 size_t k( kbegin );
11831
11832 if( k < kend )
11833 {
11834 SIMDType a1( A.load(i ,k) );
11835 SIMDType a2( A.load(i+SIMDSIZE ,k) );
11836 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
11837 SIMDType b1( set( B(k,j ) ) );
11838 SIMDType b2( set( B(k,j+1UL) ) );
11839 SIMDType xmm1( a1 * b1 );
11840 SIMDType xmm2( a2 * b1 );
11841 SIMDType xmm3( a3 * b1 );
11842 SIMDType xmm4( a1 * b2 );
11843 SIMDType xmm5( a2 * b2 );
11844 SIMDType xmm6( a3 * b2 );
11845
11846 for( ++k; k<kend; ++k ) {
11847 a1 = A.load(i ,k);
11848 a2 = A.load(i+SIMDSIZE ,k);
11849 a3 = A.load(i+SIMDSIZE*2UL,k);
11850 b1 = set( B(k,j ) );
11851 b2 = set( B(k,j+1UL) );
11852 xmm1 += a1 * b1;
11853 xmm2 += a2 * b1;
11854 xmm3 += a3 * b1;
11855 xmm4 += a1 * b2;
11856 xmm5 += a2 * b2;
11857 xmm6 += a3 * b2;
11858 }
11859
11860 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11861 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
11862 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
11863 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
11864 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
11865 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
11866 }
11867 }
11868
11869 if( j < N )
11870 {
11871 const size_t kbegin( ( IsLower_v<MT5> )
11872 ?( ( IsUpper_v<MT4> )
11873 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11874 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11875 :( IsUpper_v<MT4> ? i : 0UL ) );
11876 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
11877
11878 size_t k( kbegin );
11879
11880 if( k < kend )
11881 {
11882 SIMDType b1( set( B(k,j) ) );
11883 SIMDType xmm1( A.load(i ,k) * b1 );
11884 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
11885 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
11886
11887 for( ++k; k<kend; ++k ) {
11888 b1 = set( B(k,j) );
11889 xmm1 += A.load(i ,k) * b1;
11890 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
11891 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
11892 }
11893
11894 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11895 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
11896 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
11897 }
11898 }
11899 }
11900
11901 for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
11902 {
11903 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
11904 size_t j( UPP ? i : 0UL );
11905
11906 for( ; (j+4UL) <= jend; j+=4UL )
11907 {
11908 const size_t kbegin( ( IsLower_v<MT5> )
11909 ?( ( IsUpper_v<MT4> )
11910 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11911 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11912 :( IsUpper_v<MT4> ? i : 0UL ) );
11913 const size_t kend( ( IsUpper_v<MT5> )
11914 ?( ( IsLower_v<MT4> )
11915 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
11916 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
11917 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
11918
11919 size_t k( kbegin );
11920
11921 if( k < kend )
11922 {
11923 SIMDType a1( A.load(i ,k) );
11924 SIMDType a2( A.load(i+SIMDSIZE,k) );
11925 SIMDType b1( set( B(k,j ) ) );
11926 SIMDType b2( set( B(k,j+1UL) ) );
11927 SIMDType b3( set( B(k,j+2UL) ) );
11928 SIMDType b4( set( B(k,j+3UL) ) );
11929 SIMDType xmm1( a1 * b1 );
11930 SIMDType xmm2( a2 * b1 );
11931 SIMDType xmm3( a1 * b2 );
11932 SIMDType xmm4( a2 * b2 );
11933 SIMDType xmm5( a1 * b3 );
11934 SIMDType xmm6( a2 * b3 );
11935 SIMDType xmm7( a1 * b4 );
11936 SIMDType xmm8( a2 * b4 );
11937
11938 for( ++k; k<kend; ++k ) {
11939 a1 = A.load(i ,k);
11940 a2 = A.load(i+SIMDSIZE,k);
11941 b1 = set( B(k,j ) );
11942 b2 = set( B(k,j+1UL) );
11943 b3 = set( B(k,j+2UL) );
11944 b4 = set( B(k,j+3UL) );
11945 xmm1 += a1 * b1;
11946 xmm2 += a2 * b1;
11947 xmm3 += a1 * b2;
11948 xmm4 += a2 * b2;
11949 xmm5 += a1 * b3;
11950 xmm6 += a2 * b3;
11951 xmm7 += a1 * b4;
11952 xmm8 += a2 * b4;
11953 }
11954
11955 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11956 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
11957 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
11958 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
11959 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
11960 C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
11961 C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
11962 C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
11963 }
11964 }
11965
11966 for( ; (j+3UL) <= jend; j+=3UL )
11967 {
11968 const size_t kbegin( ( IsLower_v<MT5> )
11969 ?( ( IsUpper_v<MT4> )
11970 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11971 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11972 :( IsUpper_v<MT4> ? i : 0UL ) );
11973 const size_t kend( ( IsUpper_v<MT5> )
11974 ?( ( IsLower_v<MT4> )
11975 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
11976 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
11977 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
11978
11979 size_t k( kbegin );
11980
11981 if( k < kend )
11982 {
11983 SIMDType a1( A.load(i ,k) );
11984 SIMDType a2( A.load(i+SIMDSIZE,k) );
11985 SIMDType b1( set( B(k,j ) ) );
11986 SIMDType b2( set( B(k,j+1UL) ) );
11987 SIMDType b3( set( B(k,j+2UL) ) );
11988 SIMDType xmm1( a1 * b1 );
11989 SIMDType xmm2( a2 * b1 );
11990 SIMDType xmm3( a1 * b2 );
11991 SIMDType xmm4( a2 * b2 );
11992 SIMDType xmm5( a1 * b3 );
11993 SIMDType xmm6( a2 * b3 );
11994
11995 for( ++k; k<kend; ++k ) {
11996 a1 = A.load(i ,k);
11997 a2 = A.load(i+SIMDSIZE,k);
11998 b1 = set( B(k,j ) );
11999 b2 = set( B(k,j+1UL) );
12000 b3 = set( B(k,j+2UL) );
12001 xmm1 += a1 * b1;
12002 xmm2 += a2 * b1;
12003 xmm3 += a1 * b2;
12004 xmm4 += a2 * b2;
12005 xmm5 += a1 * b3;
12006 xmm6 += a2 * b3;
12007 }
12008
12009 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
12010 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
12011 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
12012 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
12013 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
12014 C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
12015 }
12016 }
12017
12018 for( ; (j+2UL) <= jend; j+=2UL )
12019 {
12020 const size_t kbegin( ( IsLower_v<MT5> )
12021 ?( ( IsUpper_v<MT4> )
12022 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12023 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12024 :( IsUpper_v<MT4> ? i : 0UL ) );
12025 const size_t kend( ( IsUpper_v<MT5> )
12026 ?( ( IsLower_v<MT4> )
12027 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12028 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12029 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
12030
12031 size_t k( kbegin );
12032
12033 if( k < kend )
12034 {
12035 SIMDType a1( A.load(i ,k) );
12036 SIMDType a2( A.load(i+SIMDSIZE,k) );
12037 SIMDType b1( set( B(k,j ) ) );
12038 SIMDType b2( set( B(k,j+1UL) ) );
12039 SIMDType xmm1( a1 * b1 );
12040 SIMDType xmm2( a2 * b1 );
12041 SIMDType xmm3( a1 * b2 );
12042 SIMDType xmm4( a2 * b2 );
12043
12044 for( ++k; k<kend; ++k ) {
12045 a1 = A.load(i ,k);
12046 a2 = A.load(i+SIMDSIZE,k);
12047 b1 = set( B(k,j ) );
12048 b2 = set( B(k,j+1UL) );
12049 xmm1 += a1 * b1;
12050 xmm2 += a2 * b1;
12051 xmm3 += a1 * b2;
12052 xmm4 += a2 * b2;
12053 }
12054
12055 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
12056 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
12057 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
12058 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
12059 }
12060 }
12061
12062 if( j < jend )
12063 {
12064 const size_t kbegin( ( IsLower_v<MT5> )
12065 ?( ( IsUpper_v<MT4> )
12066 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12067 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12068 :( IsUpper_v<MT4> ? i : 0UL ) );
12069 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
12070
12071 size_t k( kbegin );
12072
12073 if( k < kend )
12074 {
12075 SIMDType b1( set( B(k,j) ) );
12076 SIMDType xmm1( A.load(i ,k) * b1 );
12077 SIMDType xmm2( A.load(i+SIMDSIZE,k) * b1 );
12078
12079 for( ++k; k<kend; ++k ) {
12080 b1 = set( B(k,j) );
12081 xmm1 += A.load(i ,k) * b1;
12082 xmm2 += A.load(i+SIMDSIZE,k) * b1;
12083 }
12084
12085 C.store( i , j, C.load(i ,j) + xmm1 * factor );
12086 C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) + xmm2 * factor );
12087 }
12088 }
12089 }
12090
12091 for( ; i<ipos; i+=SIMDSIZE )
12092 {
12093 const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
12094 size_t j( UPP ? i : 0UL );
12095
12096 for( ; (j+4UL) <= jend; j+=4UL )
12097 {
12098 const size_t kbegin( ( IsLower_v<MT5> )
12099 ?( ( IsUpper_v<MT4> )
12100 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12101 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12102 :( IsUpper_v<MT4> ? i : 0UL ) );
12103 const size_t kend( ( IsUpper_v<MT5> )
12104 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
12105 :( K ) );
12106
12107 size_t k( kbegin );
12108
12109 if( k < kend )
12110 {
12111 SIMDType a1( A.load(i,k) );
12112 SIMDType xmm1( a1 * set( B(k,j ) ) );
12113 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
12114 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
12115 SIMDType xmm4( a1 * set( B(k,j+3UL) ) );
12116
12117 for( ++k; k<kend; ++k ) {
12118 a1 = A.load(i,k);
12119 xmm1 += a1 * set( B(k,j ) );
12120 xmm2 += a1 * set( B(k,j+1UL) );
12121 xmm3 += a1 * set( B(k,j+2UL) );
12122 xmm4 += a1 * set( B(k,j+3UL) );
12123 }
12124
12125 C.store( i, j , C.load(i,j ) + xmm1 * factor );
12126 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
12127 C.store( i, j+2UL, C.load(i,j+2UL) + xmm3 * factor );
12128 C.store( i, j+3UL, C.load(i,j+3UL) + xmm4 * factor );
12129 }
12130 }
12131
12132 for( ; (j+3UL) <= jend; j+=3UL )
12133 {
12134 const size_t kbegin( ( IsLower_v<MT5> )
12135 ?( ( IsUpper_v<MT4> )
12136 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12137 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12138 :( IsUpper_v<MT4> ? i : 0UL ) );
12139 const size_t kend( ( IsUpper_v<MT5> )
12140 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
12141 :( K ) );
12142
12143 size_t k( kbegin );
12144
12145 if( k < kend )
12146 {
12147 SIMDType a1( A.load(i,k) );
12148 SIMDType xmm1( a1 * set( B(k,j ) ) );
12149 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
12150 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
12151
12152 for( ++k; k<kend; ++k ) {
12153 a1 = A.load(i,k);
12154 xmm1 += a1 * set( B(k,j ) );
12155 xmm2 += a1 * set( B(k,j+1UL) );
12156 xmm3 += a1 * set( B(k,j+2UL) );
12157 }
12158
12159 C.store( i, j , C.load(i,j ) + xmm1 * factor );
12160 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
12161 C.store( i, j+2UL, C.load(i,j+2UL) + xmm3 * factor );
12162 }
12163 }
12164
12165 for( ; (j+2UL) <= jend; j+=2UL )
12166 {
12167 const size_t kbegin( ( IsLower_v<MT5> )
12168 ?( ( IsUpper_v<MT4> )
12169 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12170 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12171 :( IsUpper_v<MT4> ? i : 0UL ) );
12172 const size_t kend( ( IsUpper_v<MT5> )
12173 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
12174 :( K ) );
12175
12176 size_t k( kbegin );
12177
12178 if( k < kend )
12179 {
12180 SIMDType a1( A.load(i,k) );
12181 SIMDType xmm1( a1 * set( B(k,j ) ) );
12182 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
12183
12184 for( ++k; k<kend; ++k ) {
12185 a1 = A.load(i,k);
12186 xmm1 += a1 * set( B(k,j ) );
12187 xmm2 += a1 * set( B(k,j+1UL) );
12188 }
12189
12190 C.store( i, j , C.load(i,j ) + xmm1 * factor );
12191 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
12192 }
12193 }
12194
12195 if( j < jend )
12196 {
12197 const size_t kbegin( ( IsLower_v<MT5> )
12198 ?( ( IsUpper_v<MT4> )
12199 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12200 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12201 :( IsUpper_v<MT4> ? i : 0UL ) );
12202
12203 size_t k( kbegin );
12204
12205 if( k < K )
12206 {
12207 SIMDType xmm1( A.load(i,k) * set( B(k,j) ) );
12208
12209 for( ++k; k<K; ++k ) {
12210 xmm1 += A.load(i,k) * set( B(k,j) );
12211 }
12212
12213 C.store( i, j, C.load(i,j) + xmm1 * factor );
12214 }
12215 }
12216 }
12217
12218 for( ; remainder && i<M; ++i )
12219 {
12220 const size_t jend( LOW ? i+1UL : N );
12221 size_t j( UPP ? i : 0UL );
12222
12223 for( ; (j+2UL) <= jend; j+=2UL )
12224 {
12225 const size_t kbegin( ( IsLower_v<MT5> )
12226 ?( ( IsUpper_v<MT4> )
12227 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12228 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12229 :( IsUpper_v<MT4> ? i : 0UL ) );
12230 const size_t kend( ( IsUpper_v<MT5> )
12231 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
12232 :( K ) );
12233
12234 size_t k( kbegin );
12235
12236 if( k < kend )
12237 {
12238 ElementType value1( A(i,k) * B(k,j ) );
12239 ElementType value2( A(i,k) * B(k,j+1UL) );
12240
12241 for( ++k; k<kend; ++k ) {
12242 value1 += A(i,k) * B(k,j );
12243 value2 += A(i,k) * B(k,j+1UL);
12244 }
12245
12246 C(i,j ) += value1 * scalar;
12247 C(i,j+1UL) += value2 * scalar;
12248 }
12249 }
12250
12251 if( j < jend )
12252 {
12253 const size_t kbegin( ( IsLower_v<MT5> )
12254 ?( ( IsUpper_v<MT4> )
12255 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12256 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12257 :( IsUpper_v<MT4> ? i : 0UL ) );
12258
12259 size_t k( kbegin );
12260
12261 if( k < K )
12262 {
12263 ElementType value( A(i,k) * B(k,j) );
12264
12265 for( ++k; k<K; ++k ) {
12266 value += A(i,k) * B(k,j);
12267 }
12268
12269 C(i,j) += value * scalar;
12270 }
12271 }
12272 }
12273 }
12274 //**********************************************************************************************
12275
12276 //**Default addition assignment to dense matrices (large matrices)******************************
12290 template< typename MT3 // Type of the left-hand side target matrix
12291 , typename MT4 // Type of the left-hand side matrix operand
12292 , typename MT5 // Type of the right-hand side matrix operand
12293 , typename ST2 > // Type of the scalar value
12294 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12295 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12296 {
12297 selectDefaultAddAssignKernel( C, A, B, scalar );
12298 }
12299 //**********************************************************************************************
12300
12301 //**Vectorized default addition assignment to dense matrices (large matrices)*******************
12316 template< typename MT3 // Type of the left-hand side target matrix
12317 , typename MT4 // Type of the left-hand side matrix operand
12318 , typename MT5 // Type of the right-hand side matrix operand
12319 , typename ST2 > // Type of the scalar value
12320 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12321 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12322 {
12323 if( LOW )
12324 lmmm( C, A, B, scalar, ST2(1) );
12325 else if( UPP )
12326 ummm( C, A, B, scalar, ST2(1) );
12327 else
12328 mmm( C, A, B, scalar, ST2(1) );
12329 }
12330 //**********************************************************************************************
12331
12332 //**BLAS-based addition assignment to dense matrices (default)**********************************
12346 template< typename MT3 // Type of the left-hand side target matrix
12347 , typename MT4 // Type of the left-hand side matrix operand
12348 , typename MT5 // Type of the right-hand side matrix operand
12349 , typename ST2 > // Type of the scalar value
12350 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12351 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
12352 {
12353 selectLargeAddAssignKernel( C, A, B, scalar );
12354 }
12355 //**********************************************************************************************
12356
12357 //**BLAS-based addition assignment to dense matrices********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
/*!\brief BLAS-based addition assignment kernel.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload takes part in overload resolution only if \c UseBlasKernel_v is \a true for
// the given types. Three cases are distinguished:
//
//  - neither operand is triangular: \c gemm() is called directly on \c C with the factors
//    \c scalar and 1 (both converted to the element type of \c C);
//  - \c A is triangular: a temporary is initialized from \c B, passed to \c trmm() together
//    with \c A (left side) and afterwards added to \c C;
//  - only \c B is triangular: a temporary is initialized from \c A, passed to \c trmm()
//    together with \c B (right side) and afterwards added to \c C.
//
// NOTE(review): trmm() is presumably updating the temporary in place with the scaled product;
// this is not visible from this function and should be confirmed against its declaration.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
{
   using Elem = ElementType_t<MT3>;

   if( !IsTriangular_v<MT4> && !IsTriangular_v<MT5> ) {
      gemm( C, A, B, Elem(scalar), Elem(1) );
   }
   else if( IsTriangular_v<MT4> ) {
      // A triangular left operand takes precedence over a triangular right operand
      const auto uplo( IsLower_v<MT4> ? CblasLower : CblasUpper );
      ResultType_t<MT3> product( serial( B ) );
      trmm( product, A, CblasLeft, uplo, Elem(scalar) );
      addAssign( C, product );
   }
   else {
      const auto uplo( IsLower_v<MT5> ? CblasLower : CblasUpper );
      ResultType_t<MT3> product( serial( A ) );
      trmm( product, B, CblasRight, uplo, Elem(scalar) );
      addAssign( C, product );
   }
}
#endif
12396 //**********************************************************************************************
12397
12398 //**Addition assignment to sparse matrices******************************************************
12399 // No special implementation for the addition assignment to sparse matrices.
12400 //**********************************************************************************************
12401
12402 //**Subtraction assignment to dense matrices****************************************************
12414 template< typename MT // Type of the target dense matrix
12415 , bool SO > // Storage order of the target dense matrix
12416 friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12417 {
12419
12420 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
12421 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
12422
12423 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
12424 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
12425
12426 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
12427 return;
12428 }
12429
12430 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
12431 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
12432
12433 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
12434 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
12435 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
12436 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
12437 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
12438 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
12439
12440 DMatScalarMultExpr::selectSubAssignKernel( *lhs, A, B, rhs.scalar_ );
12441 }
12442 //**********************************************************************************************
12443
12444 //**Subtraction assignment to dense matrices (kernel selection)*********************************
12455 template< typename MT3 // Type of the left-hand side target matrix
12456 , typename MT4 // Type of the left-hand side matrix operand
12457 , typename MT5 // Type of the right-hand side matrix operand
12458 , typename ST2 > // Type of the scalar value
12459 static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12460 {
12461 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
12462 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
12463 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
12464 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
12465 selectSmallSubAssignKernel( C, A, B, scalar );
12466 else
12467 selectBlasSubAssignKernel( C, A, B, scalar );
12468 }
12469 //**********************************************************************************************
12470
12471 //**Default subtraction assignment to dense matrices********************************************
12485 template< typename MT3 // Type of the left-hand side target matrix
12486 , typename MT4 // Type of the left-hand side matrix operand
12487 , typename MT5 // Type of the right-hand side matrix operand
12488 , typename ST2 > // Type of the scalar value
12489 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12490 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
12491 {
12492 const ResultType tmp( serial( A * B * scalar ) );
12493 subAssign( C, tmp );
12494 }
12495 //**********************************************************************************************
12496
12497 //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
12511 template< typename MT3 // Type of the left-hand side target matrix
12512 , typename MT4 // Type of the left-hand side matrix operand
12513 , typename MT5 // Type of the right-hand side matrix operand
12514 , typename ST2 > // Type of the scalar value
12515 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12516 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
12517 {
12518 constexpr size_t block( BLOCK_SIZE );
12519
12520 const size_t M( A.rows() );
12521 const size_t N( B.columns() );
12522
12523 for( size_t ii=0UL; ii<M; ii+=block ) {
12524 const size_t iend( min( M, ii+block ) );
12525 for( size_t jj=0UL; jj<N; jj+=block ) {
12526 const size_t jend( min( N, jj+block ) );
12527 for( size_t i=ii; i<iend; ++i )
12528 {
12529 const size_t jbegin( ( IsUpper_v<MT4> )
12530 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
12531 :( jj ) );
12532 const size_t jpos( ( IsLower_v<MT4> )
12533 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
12534 :( jend ) );
12535
12536 for( size_t j=jbegin; j<jpos; ++j ) {
12537 C(i,j) -= A(i,j) * B(j,j) * scalar;
12538 }
12539 }
12540 }
12541 }
12542 }
12543 //**********************************************************************************************
12544
12545 //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
12559 template< typename MT3 // Type of the left-hand side target matrix
12560 , typename MT4 // Type of the left-hand side matrix operand
12561 , typename MT5 // Type of the right-hand side matrix operand
12562 , typename ST2 > // Type of the scalar value
12563 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12564 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
12565 {
12566 const size_t M( A.rows() );
12567 const size_t N( B.columns() );
12568
12569 for( size_t j=0UL; j<N; ++j )
12570 {
12571 const size_t ibegin( ( IsLower_v<MT4> )
12572 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
12573 :( 0UL ) );
12574 const size_t iend( ( IsUpper_v<MT4> )
12575 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
12576 :( M ) );
12577 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
12578
12579 const size_t inum( iend - ibegin );
12580 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
12581 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
12582
12583 for( size_t i=ibegin; i<ipos; i+=2UL ) {
12584 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
12585 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
12586 }
12587 if( ipos < iend ) {
12588 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
12589 }
12590 }
12591 }
12592 //**********************************************************************************************
12593
12594 //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
12608 template< typename MT3 // Type of the left-hand side target matrix
12609 , typename MT4 // Type of the left-hand side matrix operand
12610 , typename MT5 // Type of the right-hand side matrix operand
12611 , typename ST2 > // Type of the scalar value
12612 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12613 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
12614 {
12615 const size_t M( A.rows() );
12616 const size_t N( B.columns() );
12617
12618 for( size_t i=0UL; i<M; ++i )
12619 {
12620 const size_t jbegin( ( IsUpper_v<MT5> )
12621 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
12622 :( 0UL ) );
12623 const size_t jend( ( IsLower_v<MT5> )
12624 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
12625 :( N ) );
12626 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
12627
12628 const size_t jnum( jend - jbegin );
12629 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
12630 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
12631
12632 for( size_t j=jbegin; j<jpos; j+=2UL ) {
12633 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
12634 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
12635 }
12636 if( jpos < jend ) {
12637 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
12638 }
12639 }
12640 }
12641 //**********************************************************************************************
12642
12643 //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
12657 template< typename MT3 // Type of the left-hand side target matrix
12658 , typename MT4 // Type of the left-hand side matrix operand
12659 , typename MT5 // Type of the right-hand side matrix operand
12660 , typename ST2 > // Type of the scalar value
12661 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12662 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
12663 {
12664 constexpr size_t block( BLOCK_SIZE );
12665
12666 const size_t M( A.rows() );
12667 const size_t N( B.columns() );
12668
12669 for( size_t jj=0UL; jj<N; jj+=block ) {
12670 const size_t jend( min( N, jj+block ) );
12671 for( size_t ii=0UL; ii<M; ii+=block ) {
12672 const size_t iend( min( M, ii+block ) );
12673 for( size_t j=jj; j<jend; ++j )
12674 {
12675 const size_t ibegin( ( IsLower_v<MT5> )
12676 ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
12677 :( ii ) );
12678 const size_t ipos( ( IsUpper_v<MT5> )
12679 ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
12680 :( iend ) );
12681
12682 for( size_t i=ibegin; i<ipos; ++i ) {
12683 C(i,j) -= A(i,i) * B(i,j) * scalar;
12684 }
12685 }
12686 }
12687 }
12688 }
12689 //**********************************************************************************************
12690
12691 //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
12705 template< typename MT3 // Type of the left-hand side target matrix
12706 , typename MT4 // Type of the left-hand side matrix operand
12707 , typename MT5 // Type of the right-hand side matrix operand
12708 , typename ST2 > // Type of the scalar value
12709 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12710 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
12711 {
12712 for( size_t i=0UL; i<A.rows(); ++i ) {
12713 C(i,i) -= A(i,i) * B(i,i) * scalar;
12714 }
12715 }
12716 //**********************************************************************************************
12717
12718 //**Default subtraction assignment to dense matrices (small matrices)***************************
12732 template< typename MT3 // Type of the left-hand side target matrix
12733 , typename MT4 // Type of the left-hand side matrix operand
12734 , typename MT5 // Type of the right-hand side matrix operand
12735 , typename ST2 > // Type of the scalar value
12736 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12737 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12738 {
12739 selectDefaultSubAssignKernel( C, A, B, scalar );
12740 }
12741 //**********************************************************************************************
12742
12743 //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
12758 template< typename MT3 // Type of the left-hand side target matrix
12759 , typename MT4 // Type of the left-hand side matrix operand
12760 , typename MT5 // Type of the right-hand side matrix operand
12761 , typename ST2 > // Type of the scalar value
12762 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12763 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12764 {
12765 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
12766
12767 const size_t M( A.rows() );
12768 const size_t N( B.columns() );
12769 const size_t K( A.columns() );
12770
12771 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
12772
12773 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
12774 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
12775
12776 const SIMDType factor( set( scalar ) );
12777
12778 size_t j( 0UL );
12779
12780 if( IsIntegral_v<ElementType> )
12781 {
12782 for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
12783 for( size_t i=0UL; i<M; ++i )
12784 {
12785 const size_t kbegin( ( IsUpper_v<MT4> )
12786 ?( ( IsLower_v<MT5> )
12787 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12788 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12789 :( IsLower_v<MT5> ? j : 0UL ) );
12790 const size_t kend( ( IsLower_v<MT4> )
12791 ?( ( IsUpper_v<MT5> )
12792 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
12793 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
12794 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
12795
12796 size_t k( kbegin );
12797
12798 if( k < kend )
12799 {
12800 SIMDType a1( set( A(i,k) ) );
12801 SIMDType xmm1( a1 * B.load(k,j ) );
12802 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
12803 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
12804 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
12805 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
12806 SIMDType xmm6( a1 * B.load(k,j+SIMDSIZE*5UL) );
12807 SIMDType xmm7( a1 * B.load(k,j+SIMDSIZE*6UL) );
12808 SIMDType xmm8( a1 * B.load(k,j+SIMDSIZE*7UL) );
12809
12810 for( ++k; k<kend; ++k ) {
12811 a1 = set( A(i,k) );
12812 xmm1 += a1 * B.load(k,j );
12813 xmm2 += a1 * B.load(k,j+SIMDSIZE );
12814 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
12815 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
12816 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
12817 xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
12818 xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
12819 xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
12820 }
12821
12822 C.store( i, j , C.load(i,j ) - xmm1 * factor );
12823 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
12824 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
12825 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
12826 C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
12827 C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
12828 C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
12829 C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
12830 }
12831 }
12832 }
12833 }
12834
12835 for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
12836 {
12837 size_t i( 0UL );
12838
12839 for( ; (i+2UL) <= M; i+=2UL )
12840 {
12841 const size_t kbegin( ( IsUpper_v<MT4> )
12842 ?( ( IsLower_v<MT5> )
12843 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12844 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12845 :( IsLower_v<MT5> ? j : 0UL ) );
12846 const size_t kend( ( IsLower_v<MT4> )
12847 ?( ( IsUpper_v<MT5> )
12848 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
12849 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
12850 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
12851
12852 size_t k( kbegin );
12853
12854 if( k < kend )
12855 {
12856 SIMDType a1( set( A(i ,k) ) );
12857 SIMDType a2( set( A(i+1UL,k) ) );
12858 SIMDType b1( B.load(k,j ) );
12859 SIMDType b2( B.load(k,j+SIMDSIZE ) );
12860 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
12861 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
12862 SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
12863 SIMDType xmm1 ( a1 * b1 );
12864 SIMDType xmm2 ( a1 * b2 );
12865 SIMDType xmm3 ( a1 * b3 );
12866 SIMDType xmm4 ( a1 * b4 );
12867 SIMDType xmm5 ( a1 * b5 );
12868 SIMDType xmm6 ( a2 * b1 );
12869 SIMDType xmm7 ( a2 * b2 );
12870 SIMDType xmm8 ( a2 * b3 );
12871 SIMDType xmm9 ( a2 * b4 );
12872 SIMDType xmm10( a2 * b5 );
12873
12874 for( ++k; k<kend; ++k ) {
12875 a1 = set( A(i ,k) );
12876 a2 = set( A(i+1UL,k) );
12877 b1 = B.load(k,j );
12878 b2 = B.load(k,j+SIMDSIZE );
12879 b3 = B.load(k,j+SIMDSIZE*2UL);
12880 b4 = B.load(k,j+SIMDSIZE*3UL);
12881 b5 = B.load(k,j+SIMDSIZE*4UL);
12882 xmm1 += a1 * b1;
12883 xmm2 += a1 * b2;
12884 xmm3 += a1 * b3;
12885 xmm4 += a1 * b4;
12886 xmm5 += a1 * b5;
12887 xmm6 += a2 * b1;
12888 xmm7 += a2 * b2;
12889 xmm8 += a2 * b3;
12890 xmm9 += a2 * b4;
12891 xmm10 += a2 * b5;
12892 }
12893
12894 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12895 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
12896 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
12897 C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
12898 C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
12899 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
12900 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
12901 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
12902 C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
12903 C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
12904 }
12905 }
12906
12907 if( i < M )
12908 {
12909 const size_t kbegin( ( IsUpper_v<MT4> )
12910 ?( ( IsLower_v<MT5> )
12911 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12912 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12913 :( IsLower_v<MT5> ? j : 0UL ) );
12914 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
12915
12916 size_t k( kbegin );
12917
12918 if( k < kend )
12919 {
12920 SIMDType a1( set( A(i,k) ) );
12921 SIMDType xmm1( a1 * B.load(k,j ) );
12922 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
12923 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
12924 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
12925 SIMDType xmm5( a1 * B.load(k,j+SIMDSIZE*4UL) );
12926
12927 for( ++k; k<kend; ++k ) {
12928 a1 = set( A(i,k) );
12929 xmm1 += a1 * B.load(k,j );
12930 xmm2 += a1 * B.load(k,j+SIMDSIZE );
12931 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
12932 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
12933 xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
12934 }
12935
12936 C.store( i, j , C.load(i,j ) - xmm1 * factor );
12937 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
12938 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
12939 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
12940 C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
12941 }
12942 }
12943 }
12944
12945 for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
12946 {
12947 size_t i( 0UL );
12948
12949 for( ; (i+2UL) <= M; i+=2UL )
12950 {
12951 const size_t kbegin( ( IsUpper_v<MT4> )
12952 ?( ( IsLower_v<MT5> )
12953 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12954 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12955 :( IsLower_v<MT5> ? j : 0UL ) );
12956 const size_t kend( ( IsLower_v<MT4> )
12957 ?( ( IsUpper_v<MT5> )
12958 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
12959 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
12960 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
12961
12962 size_t k( kbegin );
12963
12964 if( k < kend )
12965 {
12966 SIMDType a1( set( A(i ,k) ) );
12967 SIMDType a2( set( A(i+1UL,k) ) );
12968 SIMDType b1( B.load(k,j ) );
12969 SIMDType b2( B.load(k,j+SIMDSIZE ) );
12970 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
12971 SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
12972 SIMDType xmm1( a1 * b1 );
12973 SIMDType xmm2( a1 * b2 );
12974 SIMDType xmm3( a1 * b3 );
12975 SIMDType xmm4( a1 * b4 );
12976 SIMDType xmm5( a2 * b1 );
12977 SIMDType xmm6( a2 * b2 );
12978 SIMDType xmm7( a2 * b3 );
12979 SIMDType xmm8( a2 * b4 );
12980
12981 for( ++k; k<kend; ++k ) {
12982 a1 = set( A(i ,k) );
12983 a2 = set( A(i+1UL,k) );
12984 b1 = B.load(k,j );
12985 b2 = B.load(k,j+SIMDSIZE );
12986 b3 = B.load(k,j+SIMDSIZE*2UL);
12987 b4 = B.load(k,j+SIMDSIZE*3UL);
12988 xmm1 += a1 * b1;
12989 xmm2 += a1 * b2;
12990 xmm3 += a1 * b3;
12991 xmm4 += a1 * b4;
12992 xmm5 += a2 * b1;
12993 xmm6 += a2 * b2;
12994 xmm7 += a2 * b3;
12995 xmm8 += a2 * b4;
12996 }
12997
12998 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12999 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
13000 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
13001 C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
13002 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
13003 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
13004 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
13005 C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
13006 }
13007 }
13008
13009 if( i < M )
13010 {
13011 const size_t kbegin( ( IsUpper_v<MT4> )
13012 ?( ( IsLower_v<MT5> )
13013 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13014 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13015 :( IsLower_v<MT5> ? j : 0UL ) );
13016 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
13017
13018 size_t k( kbegin );
13019
13020 if( k < kend )
13021 {
13022 SIMDType a1( set( A(i,k) ) );
13023 SIMDType xmm1( a1 * B.load(k,j ) );
13024 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
13025 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
13026 SIMDType xmm4( a1 * B.load(k,j+SIMDSIZE*3UL) );
13027
13028 for( ++k; k<kend; ++k ) {
13029 a1 = set( A(i,k) );
13030 xmm1 += a1 * B.load(k,j );
13031 xmm2 += a1 * B.load(k,j+SIMDSIZE );
13032 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
13033 xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
13034 }
13035
13036 C.store( i, j , C.load(i,j ) - xmm1 * factor );
13037 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
13038 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
13039 C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
13040 }
13041 }
13042 }
13043
13044 for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
13045 {
13046 size_t i( 0UL );
13047
13048 for( ; (i+2UL) <= M; i+=2UL )
13049 {
13050 const size_t kbegin( ( IsUpper_v<MT4> )
13051 ?( ( IsLower_v<MT5> )
13052 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13053 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13054 :( IsLower_v<MT5> ? j : 0UL ) );
13055 const size_t kend( ( IsLower_v<MT4> )
13056 ?( ( IsUpper_v<MT5> )
13057 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
13058 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
13059 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
13060
13061 size_t k( kbegin );
13062
13063 if( k < kend )
13064 {
13065 SIMDType a1( set( A(i ,k) ) );
13066 SIMDType a2( set( A(i+1UL,k) ) );
13067 SIMDType b1( B.load(k,j ) );
13068 SIMDType b2( B.load(k,j+SIMDSIZE ) );
13069 SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
13070 SIMDType xmm1( a1 * b1 );
13071 SIMDType xmm2( a1 * b2 );
13072 SIMDType xmm3( a1 * b3 );
13073 SIMDType xmm4( a2 * b1 );
13074 SIMDType xmm5( a2 * b2 );
13075 SIMDType xmm6( a2 * b3 );
13076
13077 for( ++k; k<kend; ++k ) {
13078 a1 = set( A(i ,k) );
13079 a2 = set( A(i+1UL,k) );
13080 b1 = B.load(k,j );
13081 b2 = B.load(k,j+SIMDSIZE );
13082 b3 = B.load(k,j+SIMDSIZE*2UL);
13083 xmm1 += a1 * b1;
13084 xmm2 += a1 * b2;
13085 xmm3 += a1 * b3;
13086 xmm4 += a2 * b1;
13087 xmm5 += a2 * b2;
13088 xmm6 += a2 * b3;
13089 }
13090
13091 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13092 C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
13093 C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
13094 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
13095 C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
13096 C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
13097 }
13098 }
13099
13100 if( i < M )
13101 {
13102 const size_t kbegin( ( IsUpper_v<MT4> )
13103 ?( ( IsLower_v<MT5> )
13104 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13105 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13106 :( IsLower_v<MT5> ? j : 0UL ) );
13107 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
13108
13109 size_t k( kbegin );
13110
13111 if( k < kend )
13112 {
13113 SIMDType a1( set( A(i,k) ) );
13114 SIMDType xmm1( a1 * B.load(k,j ) );
13115 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE ) );
13116 SIMDType xmm3( a1 * B.load(k,j+SIMDSIZE*2UL) );
13117
13118 for( ++k; k<kend; ++k ) {
13119 a1 = set( A(i,k) );
13120 xmm1 += a1 * B.load(k,j );
13121 xmm2 += a1 * B.load(k,j+SIMDSIZE );
13122 xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
13123 }
13124
13125 C.store( i, j , C.load(i,j ) - xmm1 * factor );
13126 C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
13127 C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
13128 }
13129 }
13130 }
13131
13132 for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
13133 {
13134 const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
13135 size_t i( LOW ? j : 0UL );
13136
13137 for( ; (i+4UL) <= iend; i+=4UL )
13138 {
13139 const size_t kbegin( ( IsUpper_v<MT4> )
13140 ?( ( IsLower_v<MT5> )
13141 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13142 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13143 :( IsLower_v<MT5> ? j : 0UL ) );
13144 const size_t kend( ( IsLower_v<MT4> )
13145 ?( ( IsUpper_v<MT5> )
13146 ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
13147 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
13148 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
13149
13150 size_t k( kbegin );
13151
13152 if( k < kend )
13153 {
13154 SIMDType a1( set( A(i ,k) ) );
13155 SIMDType a2( set( A(i+1UL,k) ) );
13156 SIMDType a3( set( A(i+2UL,k) ) );
13157 SIMDType a4( set( A(i+3UL,k) ) );
13158 SIMDType b1( B.load(k,j ) );
13159 SIMDType b2( B.load(k,j+SIMDSIZE) );
13160 SIMDType xmm1( a1 * b1 );
13161 SIMDType xmm2( a1 * b2 );
13162 SIMDType xmm3( a2 * b1 );
13163 SIMDType xmm4( a2 * b2 );
13164 SIMDType xmm5( a3 * b1 );
13165 SIMDType xmm6( a3 * b2 );
13166 SIMDType xmm7( a4 * b1 );
13167 SIMDType xmm8( a4 * b2 );
13168
13169 for( ++k; k<kend; ++k ) {
13170 a1 = set( A(i ,k) );
13171 a2 = set( A(i+1UL,k) );
13172 a3 = set( A(i+2UL,k) );
13173 a4 = set( A(i+3UL,k) );
13174 b1 = B.load(k,j );
13175 b2 = B.load(k,j+SIMDSIZE);
13176 xmm1 += a1 * b1;
13177 xmm2 += a1 * b2;
13178 xmm3 += a2 * b1;
13179 xmm4 += a2 * b2;
13180 xmm5 += a3 * b1;
13181 xmm6 += a3 * b2;
13182 xmm7 += a4 * b1;
13183 xmm8 += a4 * b2;
13184 }
13185
13186 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13187 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
13188 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
13189 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
13190 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
13191 C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
13192 C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
13193 C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
13194 }
13195 }
13196
13197 for( ; (i+3UL) <= iend; i+=3UL )
13198 {
13199 const size_t kbegin( ( IsUpper_v<MT4> )
13200 ?( ( IsLower_v<MT5> )
13201 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13202 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13203 :( IsLower_v<MT5> ? j : 0UL ) );
13204 const size_t kend( ( IsLower_v<MT4> )
13205 ?( ( IsUpper_v<MT5> )
13206 ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
13207 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
13208 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
13209
13210 size_t k( kbegin );
13211
13212 if( k < kend )
13213 {
13214 SIMDType a1( set( A(i ,k) ) );
13215 SIMDType a2( set( A(i+1UL,k) ) );
13216 SIMDType a3( set( A(i+2UL,k) ) );
13217 SIMDType b1( B.load(k,j ) );
13218 SIMDType b2( B.load(k,j+SIMDSIZE) );
13219 SIMDType xmm1( a1 * b1 );
13220 SIMDType xmm2( a1 * b2 );
13221 SIMDType xmm3( a2 * b1 );
13222 SIMDType xmm4( a2 * b2 );
13223 SIMDType xmm5( a3 * b1 );
13224 SIMDType xmm6( a3 * b2 );
13225
13226 for( ++k; k<kend; ++k ) {
13227 a1 = set( A(i ,k) );
13228 a2 = set( A(i+1UL,k) );
13229 a3 = set( A(i+2UL,k) );
13230 b1 = B.load(k,j );
13231 b2 = B.load(k,j+SIMDSIZE);
13232 xmm1 += a1 * b1;
13233 xmm2 += a1 * b2;
13234 xmm3 += a2 * b1;
13235 xmm4 += a2 * b2;
13236 xmm5 += a3 * b1;
13237 xmm6 += a3 * b2;
13238 }
13239
13240 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13241 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
13242 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
13243 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
13244 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
13245 C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
13246 }
13247 }
13248
13249 for( ; (i+2UL) <= iend; i+=2UL )
13250 {
13251 const size_t kbegin( ( IsUpper_v<MT4> )
13252 ?( ( IsLower_v<MT5> )
13253 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13254 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13255 :( IsLower_v<MT5> ? j : 0UL ) );
13256 const size_t kend( ( IsLower_v<MT4> )
13257 ?( ( IsUpper_v<MT5> )
13258 ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
13259 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
13260 :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
13261
13262 size_t k( kbegin );
13263
13264 if( k < kend )
13265 {
13266 SIMDType a1( set( A(i ,k) ) );
13267 SIMDType a2( set( A(i+1UL,k) ) );
13268 SIMDType b1( B.load(k,j ) );
13269 SIMDType b2( B.load(k,j+SIMDSIZE) );
13270 SIMDType xmm1( a1 * b1 );
13271 SIMDType xmm2( a1 * b2 );
13272 SIMDType xmm3( a2 * b1 );
13273 SIMDType xmm4( a2 * b2 );
13274
13275 for( ++k; k<kend; ++k ) {
13276 a1 = set( A(i ,k) );
13277 a2 = set( A(i+1UL,k) );
13278 b1 = B.load(k,j );
13279 b2 = B.load(k,j+SIMDSIZE);
13280 xmm1 += a1 * b1;
13281 xmm2 += a1 * b2;
13282 xmm3 += a2 * b1;
13283 xmm4 += a2 * b2;
13284 }
13285
13286 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13287 C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
13288 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
13289 C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
13290 }
13291 }
13292
13293 if( i < iend )
13294 {
13295 const size_t kbegin( ( IsUpper_v<MT4> )
13296 ?( ( IsLower_v<MT5> )
13297 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13298 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13299 :( IsLower_v<MT5> ? j : 0UL ) );
13300 const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
13301
13302 size_t k( kbegin );
13303
13304 if( k < kend )
13305 {
13306 SIMDType a1( set( A(i,k) ) );
13307 SIMDType xmm1( a1 * B.load(k,j ) );
13308 SIMDType xmm2( a1 * B.load(k,j+SIMDSIZE) );
13309
13310 for( ++k; k<kend; ++k ) {
13311 a1 = set( A(i,k) );
13312 xmm1 += a1 * B.load(k,j );
13313 xmm2 += a1 * B.load(k,j+SIMDSIZE);
13314 }
13315
13316 C.store( i, j , C.load(i,j ) - xmm1 * factor );
13317 C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) - xmm2 * factor );
13318 }
13319 }
13320 }
13321
13322 for( ; j<jpos; j+=SIMDSIZE )
13323 {
13324 const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
13325 size_t i( LOW ? j : 0UL );
13326
13327 for( ; (i+4UL) <= iend; i+=4UL )
13328 {
13329 const size_t kbegin( ( IsUpper_v<MT4> )
13330 ?( ( IsLower_v<MT5> )
13331 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13332 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13333 :( IsLower_v<MT5> ? j : 0UL ) );
13334 const size_t kend( ( IsLower_v<MT4> )
13335 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
13336 :( K ) );
13337
13338 size_t k( kbegin );
13339
13340 if( k < kend )
13341 {
13342 SIMDType b1( B.load(k,j) );
13343 SIMDType xmm1( set( A(i ,k) ) * b1 );
13344 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
13345 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
13346 SIMDType xmm4( set( A(i+3UL,k) ) * b1 );
13347
13348 for( ++k; k<kend; ++k ) {
13349 b1 = B.load(k,j);
13350 xmm1 += set( A(i ,k) ) * b1;
13351 xmm2 += set( A(i+1UL,k) ) * b1;
13352 xmm3 += set( A(i+2UL,k) ) * b1;
13353 xmm4 += set( A(i+3UL,k) ) * b1;
13354 }
13355
13356 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13357 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
13358 C.store( i+2UL, j, C.load(i+2UL,j) - xmm3 * factor );
13359 C.store( i+3UL, j, C.load(i+3UL,j) - xmm4 * factor );
13360 }
13361 }
13362
13363 for( ; (i+3UL) <= iend; i+=3UL )
13364 {
13365 const size_t kbegin( ( IsUpper_v<MT4> )
13366 ?( ( IsLower_v<MT5> )
13367 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13368 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13369 :( IsLower_v<MT5> ? j : 0UL ) );
13370 const size_t kend( ( IsLower_v<MT4> )
13371 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
13372 :( K ) );
13373
13374 size_t k( kbegin );
13375
13376 if( k < kend )
13377 {
13378 SIMDType b1( B.load(k,j) );
13379 SIMDType xmm1( set( A(i ,k) ) * b1 );
13380 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
13381 SIMDType xmm3( set( A(i+2UL,k) ) * b1 );
13382
13383 for( ++k; k<kend; ++k ) {
13384 b1 = B.load(k,j);
13385 xmm1 += set( A(i ,k) ) * b1;
13386 xmm2 += set( A(i+1UL,k) ) * b1;
13387 xmm3 += set( A(i+2UL,k) ) * b1;
13388 }
13389
13390 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13391 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
13392 C.store( i+2UL, j, C.load(i+2UL,j) - xmm3 * factor );
13393 }
13394 }
13395
13396 for( ; (i+2UL) <= iend; i+=2UL )
13397 {
13398 const size_t kbegin( ( IsUpper_v<MT4> )
13399 ?( ( IsLower_v<MT5> )
13400 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13401 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13402 :( IsLower_v<MT5> ? j : 0UL ) );
13403 const size_t kend( ( IsLower_v<MT4> )
13404 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
13405 :( K ) );
13406
13407 size_t k( kbegin );
13408
13409 if( k < kend )
13410 {
13411 SIMDType b1( B.load(k,j) );
13412 SIMDType xmm1( set( A(i ,k) ) * b1 );
13413 SIMDType xmm2( set( A(i+1UL,k) ) * b1 );
13414
13415 for( ++k; k<kend; ++k ) {
13416 b1 = B.load(k,j);
13417 xmm1 += set( A(i ,k) ) * b1;
13418 xmm2 += set( A(i+1UL,k) ) * b1;
13419 }
13420
13421 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13422 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
13423 }
13424 }
13425
13426 if( i < iend )
13427 {
13428 const size_t kbegin( ( IsUpper_v<MT4> )
13429 ?( ( IsLower_v<MT5> )
13430 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13431 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13432 :( IsLower_v<MT5> ? j : 0UL ) );
13433
13434 size_t k( kbegin );
13435
13436 if( k < K )
13437 {
13438 SIMDType xmm1( set( A(i,k) ) * B.load(k,j) );
13439
13440 for( ++k; k<K; ++k ) {
13441 xmm1 += set( A(i,k) ) * B.load(k,j);
13442 }
13443
13444 C.store( i, j, C.load(i,j) - xmm1 * factor );
13445 }
13446 }
13447 }
13448
13449 for( ; remainder && j<N; ++j )
13450 {
13451 const size_t iend( UPP ? j+1UL : M );
13452 size_t i( LOW ? j : 0UL );
13453
13454 for( ; (i+2UL) <= iend; i+=2UL )
13455 {
13456 const size_t kbegin( ( IsUpper_v<MT4> )
13457 ?( ( IsLower_v<MT5> )
13458 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13459 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13460 :( IsLower_v<MT5> ? j : 0UL ) );
13461 const size_t kend( ( IsLower_v<MT4> )
13462 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
13463 :( K ) );
13464
13465 size_t k( kbegin );
13466
13467 if( k < kend )
13468 {
13469 ElementType value1( A(i ,k) * B(k,j) );
13470 ElementType value2( A(i+1UL,k) * B(k,j) );
13471
13472 for( ++k; k<kend; ++k ) {
13473 value1 += A(i ,k) * B(k,j);
13474 value2 += A(i+1UL,k) * B(k,j);
13475 }
13476
13477 C(i ,j) -= value1 * scalar;
13478 C(i+1UL,j) -= value2 * scalar;
13479 }
13480 }
13481
13482 if( i < iend )
13483 {
13484 const size_t kbegin( ( IsUpper_v<MT4> )
13485 ?( ( IsLower_v<MT5> )
13486 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13487 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13488 :( IsLower_v<MT5> ? j : 0UL ) );
13489
13490 size_t k( kbegin );
13491
13492 if( k < K )
13493 {
13494 ElementType value( A(i,k) * B(k,j) );
13495
13496 for( ++k; k<K; ++k ) {
13497 value += A(i,k) * B(k,j);
13498 }
13499
13500 C(i,j) -= value * scalar;
13501 }
13502 }
13503 }
13504 }
13505 //**********************************************************************************************
13506
13507 //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
13522 template< typename MT3 // Type of the left-hand side target matrix
13523 , typename MT4 // Type of the left-hand side matrix operand
13524 , typename MT5 // Type of the right-hand side matrix operand
13525 , typename ST2 > // Type of the scalar value
13526 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
13527 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
13528 {
13529 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
13530
13531 const size_t M( A.rows() );
13532 const size_t N( B.columns() );
13533 const size_t K( A.columns() );
13534
13535 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
13536
13537 const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
13538 BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );
13539
13540 const SIMDType factor( set( scalar ) );
13541
13542 size_t i( 0UL );
13543
13544 if( IsIntegral_v<ElementType> )
13545 {
13546 for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
13547 for( size_t j=0UL; j<N; ++j )
13548 {
13549 const size_t kbegin( ( IsLower_v<MT5> )
13550 ?( ( IsUpper_v<MT4> )
13551 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13552 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13553 :( IsUpper_v<MT4> ? i : 0UL ) );
13554 const size_t kend( ( IsUpper_v<MT5> )
13555 ?( ( IsLower_v<MT4> )
13556 ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
13557 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
13558 :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
13559
13560 size_t k( kbegin );
13561
13562 if( k < kend )
13563 {
13564 SIMDType b1( set( B(k,j) ) );
13565 SIMDType xmm1( A.load(i ,k) * b1 );
13566 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
13567 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
13568 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
13569 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
13570 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,k) * b1 );
13571 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,k) * b1 );
13572 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,k) * b1 );
13573
13574 for( ++k; k<kend; ++k ) {
13575 b1 = set( B(k,j) );
13576 xmm1 += A.load(i ,k) * b1;
13577 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
13578 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
13579 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
13580 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
13581 xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
13582 xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
13583 xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
13584 }
13585
13586 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13587 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
13588 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
13589 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
13590 C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
13591 C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
13592 C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
13593 C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
13594 }
13595 }
13596 }
13597 }
13598
13599 for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
13600 {
13601 size_t j( 0UL );
13602
13603 for( ; (j+2UL) <= N; j+=2UL )
13604 {
13605 const size_t kbegin( ( IsLower_v<MT5> )
13606 ?( ( IsUpper_v<MT4> )
13607 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13608 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13609 :( IsUpper_v<MT4> ? i : 0UL ) );
13610 const size_t kend( ( IsUpper_v<MT5> )
13611 ?( ( IsLower_v<MT4> )
13612 ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
13613 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
13614 :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
13615
13616 size_t k( kbegin );
13617
13618 if( k < kend )
13619 {
13620 SIMDType a1( A.load(i ,k) );
13621 SIMDType a2( A.load(i+SIMDSIZE ,k) );
13622 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
13623 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
13624 SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
13625 SIMDType b1( set( B(k,j ) ) );
13626 SIMDType b2( set( B(k,j+1UL) ) );
13627 SIMDType xmm1 ( a1 * b1 );
13628 SIMDType xmm2 ( a2 * b1 );
13629 SIMDType xmm3 ( a3 * b1 );
13630 SIMDType xmm4 ( a4 * b1 );
13631 SIMDType xmm5 ( a5 * b1 );
13632 SIMDType xmm6 ( a1 * b2 );
13633 SIMDType xmm7 ( a2 * b2 );
13634 SIMDType xmm8 ( a3 * b2 );
13635 SIMDType xmm9 ( a4 * b2 );
13636 SIMDType xmm10( a5 * b2 );
13637
13638 for( ++k; k<kend; ++k ) {
13639 a1 = A.load(i ,k);
13640 a2 = A.load(i+SIMDSIZE ,k);
13641 a3 = A.load(i+SIMDSIZE*2UL,k);
13642 a4 = A.load(i+SIMDSIZE*3UL,k);
13643 a5 = A.load(i+SIMDSIZE*4UL,k);
13644 b1 = set( B(k,j ) );
13645 b2 = set( B(k,j+1UL) );
13646 xmm1 += a1 * b1;
13647 xmm2 += a2 * b1;
13648 xmm3 += a3 * b1;
13649 xmm4 += a4 * b1;
13650 xmm5 += a5 * b1;
13651 xmm6 += a1 * b2;
13652 xmm7 += a2 * b2;
13653 xmm8 += a3 * b2;
13654 xmm9 += a4 * b2;
13655 xmm10 += a5 * b2;
13656 }
13657
13658 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13659 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
13660 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
13661 C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
13662 C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
13663 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
13664 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
13665 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
13666 C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
13667 C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
13668 }
13669 }
13670
13671 if( j < N )
13672 {
13673 const size_t kbegin( ( IsLower_v<MT5> )
13674 ?( ( IsUpper_v<MT4> )
13675 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13676 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13677 :( IsUpper_v<MT4> ? i : 0UL ) );
13678 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
13679
13680 size_t k( kbegin );
13681
13682 if( k < kend )
13683 {
13684 SIMDType b1( set( B(k,j) ) );
13685 SIMDType xmm1( A.load(i ,k) * b1 );
13686 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
13687 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
13688 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
13689 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,k) * b1 );
13690
13691 for( ++k; k<kend; ++k ) {
13692 b1 = set( B(k,j) );
13693 xmm1 += A.load(i ,k) * b1;
13694 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
13695 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
13696 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
13697 xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
13698 }
13699
13700 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13701 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
13702 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
13703 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
13704 C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
13705 }
13706 }
13707 }
13708
13709 for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
13710 {
13711 size_t j( 0UL );
13712
13713 for( ; (j+2UL) <= N; j+=2UL )
13714 {
13715 const size_t kbegin( ( IsLower_v<MT5> )
13716 ?( ( IsUpper_v<MT4> )
13717 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13718 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13719 :( IsUpper_v<MT4> ? i : 0UL ) );
13720 const size_t kend( ( IsUpper_v<MT5> )
13721 ?( ( IsLower_v<MT4> )
13722 ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
13723 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
13724 :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
13725
13726 size_t k( kbegin );
13727
13728 if( k < kend )
13729 {
13730 SIMDType a1( A.load(i ,k) );
13731 SIMDType a2( A.load(i+SIMDSIZE ,k) );
13732 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
13733 SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
13734 SIMDType b1( set( B(k,j ) ) );
13735 SIMDType b2( set( B(k,j+1UL) ) );
13736 SIMDType xmm1( a1 * b1 );
13737 SIMDType xmm2( a2 * b1 );
13738 SIMDType xmm3( a3 * b1 );
13739 SIMDType xmm4( a4 * b1 );
13740 SIMDType xmm5( a1 * b2 );
13741 SIMDType xmm6( a2 * b2 );
13742 SIMDType xmm7( a3 * b2 );
13743 SIMDType xmm8( a4 * b2 );
13744
13745 for( ++k; k<kend; ++k ) {
13746 a1 = A.load(i ,k);
13747 a2 = A.load(i+SIMDSIZE ,k);
13748 a3 = A.load(i+SIMDSIZE*2UL,k);
13749 a4 = A.load(i+SIMDSIZE*3UL,k);
13750 b1 = set( B(k,j ) );
13751 b2 = set( B(k,j+1UL) );
13752 xmm1 += a1 * b1;
13753 xmm2 += a2 * b1;
13754 xmm3 += a3 * b1;
13755 xmm4 += a4 * b1;
13756 xmm5 += a1 * b2;
13757 xmm6 += a2 * b2;
13758 xmm7 += a3 * b2;
13759 xmm8 += a4 * b2;
13760 }
13761
13762 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13763 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
13764 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
13765 C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
13766 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
13767 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
13768 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
13769 C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
13770 }
13771 }
13772
13773 if( j < N )
13774 {
13775 const size_t kbegin( ( IsLower_v<MT5> )
13776 ?( ( IsUpper_v<MT4> )
13777 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13778 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13779 :( IsUpper_v<MT4> ? i : 0UL ) );
13780 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
13781
13782 size_t k( kbegin );
13783
13784 if( k < kend )
13785 {
13786 SIMDType b1( set( B(k,j) ) );
13787 SIMDType xmm1( A.load(i ,k) * b1 );
13788 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
13789 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
13790 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,k) * b1 );
13791
13792 for( ++k; k<kend; ++k ) {
13793 b1 = set( B(k,j) );
13794 xmm1 += A.load(i ,k) * b1;
13795 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
13796 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
13797 xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
13798 }
13799
13800 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13801 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
13802 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
13803 C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
13804 }
13805 }
13806 }
13807
13808 for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
13809 {
13810 size_t j( 0UL );
13811
13812 for( ; (j+2UL) <= N; j+=2UL )
13813 {
13814 const size_t kbegin( ( IsLower_v<MT5> )
13815 ?( ( IsUpper_v<MT4> )
13816 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13817 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13818 :( IsUpper_v<MT4> ? i : 0UL ) );
13819 const size_t kend( ( IsUpper_v<MT5> )
13820 ?( ( IsLower_v<MT4> )
13821 ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
13822 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
13823 :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
13824
13825 size_t k( kbegin );
13826
13827 if( k < kend )
13828 {
13829 SIMDType a1( A.load(i ,k) );
13830 SIMDType a2( A.load(i+SIMDSIZE ,k) );
13831 SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
13832 SIMDType b1( set( B(k,j ) ) );
13833 SIMDType b2( set( B(k,j+1UL) ) );
13834 SIMDType xmm1( a1 * b1 );
13835 SIMDType xmm2( a2 * b1 );
13836 SIMDType xmm3( a3 * b1 );
13837 SIMDType xmm4( a1 * b2 );
13838 SIMDType xmm5( a2 * b2 );
13839 SIMDType xmm6( a3 * b2 );
13840
13841 for( ++k; k<kend; ++k ) {
13842 a1 = A.load(i ,k);
13843 a2 = A.load(i+SIMDSIZE ,k);
13844 a3 = A.load(i+SIMDSIZE*2UL,k);
13845 b1 = set( B(k,j ) );
13846 b2 = set( B(k,j+1UL) );
13847 xmm1 += a1 * b1;
13848 xmm2 += a2 * b1;
13849 xmm3 += a3 * b1;
13850 xmm4 += a1 * b2;
13851 xmm5 += a2 * b2;
13852 xmm6 += a3 * b2;
13853 }
13854
13855 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13856 C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
13857 C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
13858 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
13859 C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
13860 C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
13861 }
13862 }
13863
13864 if( j < N )
13865 {
13866 const size_t kbegin( ( IsLower_v<MT5> )
13867 ?( ( IsUpper_v<MT4> )
13868 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13869 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13870 :( IsUpper_v<MT4> ? i : 0UL ) );
13871 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
13872
13873 size_t k( kbegin );
13874
13875 if( k < kend )
13876 {
13877 SIMDType b1( set( B(k,j) ) );
13878 SIMDType xmm1( A.load(i ,k) * b1 );
13879 SIMDType xmm2( A.load(i+SIMDSIZE ,k) * b1 );
13880 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,k) * b1 );
13881
13882 for( ++k; k<kend; ++k ) {
13883 b1 = set( B(k,j) );
13884 xmm1 += A.load(i ,k) * b1;
13885 xmm2 += A.load(i+SIMDSIZE ,k) * b1;
13886 xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
13887 }
13888
13889 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13890 C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
13891 C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
13892 }
13893 }
13894 }
13895
13896 for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
13897 {
13898 const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
13899 size_t j( UPP ? i : 0UL );
13900
13901 for( ; (j+4UL) <= jend; j+=4UL )
13902 {
13903 const size_t kbegin( ( IsLower_v<MT5> )
13904 ?( ( IsUpper_v<MT4> )
13905 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13906 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13907 :( IsUpper_v<MT4> ? i : 0UL ) );
13908 const size_t kend( ( IsUpper_v<MT5> )
13909 ?( ( IsLower_v<MT4> )
13910 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
13911 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
13912 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
13913
13914 size_t k( kbegin );
13915
13916 if( k < kend )
13917 {
13918 SIMDType a1( A.load(i ,k) );
13919 SIMDType a2( A.load(i+SIMDSIZE,k) );
13920 SIMDType b1( set( B(k,j ) ) );
13921 SIMDType b2( set( B(k,j+1UL) ) );
13922 SIMDType b3( set( B(k,j+2UL) ) );
13923 SIMDType b4( set( B(k,j+3UL) ) );
13924 SIMDType xmm1( a1 * b1 );
13925 SIMDType xmm2( a2 * b1 );
13926 SIMDType xmm3( a1 * b2 );
13927 SIMDType xmm4( a2 * b2 );
13928 SIMDType xmm5( a1 * b3 );
13929 SIMDType xmm6( a2 * b3 );
13930 SIMDType xmm7( a1 * b4 );
13931 SIMDType xmm8( a2 * b4 );
13932
13933 for( ++k; k<kend; ++k ) {
13934 a1 = A.load(i ,k);
13935 a2 = A.load(i+SIMDSIZE,k);
13936 b1 = set( B(k,j ) );
13937 b2 = set( B(k,j+1UL) );
13938 b3 = set( B(k,j+2UL) );
13939 b4 = set( B(k,j+3UL) );
13940 xmm1 += a1 * b1;
13941 xmm2 += a2 * b1;
13942 xmm3 += a1 * b2;
13943 xmm4 += a2 * b2;
13944 xmm5 += a1 * b3;
13945 xmm6 += a2 * b3;
13946 xmm7 += a1 * b4;
13947 xmm8 += a2 * b4;
13948 }
13949
13950 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13951 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
13952 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
13953 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
13954 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
13955 C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
13956 C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
13957 C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
13958 }
13959 }
13960
13961 for( ; (j+3UL) <= jend; j+=3UL )
13962 {
13963 const size_t kbegin( ( IsLower_v<MT5> )
13964 ?( ( IsUpper_v<MT4> )
13965 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13966 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13967 :( IsUpper_v<MT4> ? i : 0UL ) );
13968 const size_t kend( ( IsUpper_v<MT5> )
13969 ?( ( IsLower_v<MT4> )
13970 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
13971 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
13972 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
13973
13974 size_t k( kbegin );
13975
13976 if( k < kend )
13977 {
13978 SIMDType a1( A.load(i ,k) );
13979 SIMDType a2( A.load(i+SIMDSIZE,k) );
13980 SIMDType b1( set( B(k,j ) ) );
13981 SIMDType b2( set( B(k,j+1UL) ) );
13982 SIMDType b3( set( B(k,j+2UL) ) );
13983 SIMDType xmm1( a1 * b1 );
13984 SIMDType xmm2( a2 * b1 );
13985 SIMDType xmm3( a1 * b2 );
13986 SIMDType xmm4( a2 * b2 );
13987 SIMDType xmm5( a1 * b3 );
13988 SIMDType xmm6( a2 * b3 );
13989
13990 for( ++k; k<kend; ++k ) {
13991 a1 = A.load(i ,k);
13992 a2 = A.load(i+SIMDSIZE,k);
13993 b1 = set( B(k,j ) );
13994 b2 = set( B(k,j+1UL) );
13995 b3 = set( B(k,j+2UL) );
13996 xmm1 += a1 * b1;
13997 xmm2 += a2 * b1;
13998 xmm3 += a1 * b2;
13999 xmm4 += a2 * b2;
14000 xmm5 += a1 * b3;
14001 xmm6 += a2 * b3;
14002 }
14003
14004 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
14005 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
14006 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
14007 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
14008 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
14009 C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
14010 }
14011 }
14012
14013 for( ; (j+2UL) <= jend; j+=2UL )
14014 {
14015 const size_t kbegin( ( IsLower_v<MT5> )
14016 ?( ( IsUpper_v<MT4> )
14017 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14018 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14019 :( IsUpper_v<MT4> ? i : 0UL ) );
14020 const size_t kend( ( IsUpper_v<MT5> )
14021 ?( ( IsLower_v<MT4> )
14022 ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
14023 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
14024 :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
14025
14026 size_t k( kbegin );
14027
14028 if( k < kend )
14029 {
14030 SIMDType a1( A.load(i ,k) );
14031 SIMDType a2( A.load(i+SIMDSIZE,k) );
14032 SIMDType b1( set( B(k,j ) ) );
14033 SIMDType b2( set( B(k,j+1UL) ) );
14034 SIMDType xmm1( a1 * b1 );
14035 SIMDType xmm2( a2 * b1 );
14036 SIMDType xmm3( a1 * b2 );
14037 SIMDType xmm4( a2 * b2 );
14038
14039 for( ++k; k<kend; ++k ) {
14040 a1 = A.load(i ,k);
14041 a2 = A.load(i+SIMDSIZE,k);
14042 b1 = set( B(k,j ) );
14043 b2 = set( B(k,j+1UL) );
14044 xmm1 += a1 * b1;
14045 xmm2 += a2 * b1;
14046 xmm3 += a1 * b2;
14047 xmm4 += a2 * b2;
14048 }
14049
14050 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
14051 C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
14052 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
14053 C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
14054 }
14055 }
14056
14057 if( j < jend )
14058 {
14059 const size_t kbegin( ( IsLower_v<MT5> )
14060 ?( ( IsUpper_v<MT4> )
14061 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14062 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14063 :( IsUpper_v<MT4> ? i : 0UL ) );
14064 const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
14065
14066 size_t k( kbegin );
14067
14068 if( k < kend )
14069 {
14070 SIMDType b1( set( B(k,j) ) );
14071 SIMDType xmm1( A.load(i ,k) * b1 );
14072 SIMDType xmm2( A.load(i+SIMDSIZE,k) * b1 );
14073
14074 for( ++k; k<kend; ++k ) {
14075 b1 = set( B(k,j) );
14076 xmm1 += A.load(i ,k) * b1;
14077 xmm2 += A.load(i+SIMDSIZE,k) * b1;
14078 }
14079
14080 C.store( i , j, C.load(i ,j) - xmm1 * factor );
14081 C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) - xmm2 * factor );
14082 }
14083 }
14084 }
14085
14086 for( ; i<ipos; i+=SIMDSIZE )
14087 {
14088 const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
14089 size_t j( UPP ? i : 0UL );
14090
14091 for( ; (j+4UL) <= jend; j+=4UL )
14092 {
14093 const size_t kbegin( ( IsLower_v<MT5> )
14094 ?( ( IsUpper_v<MT4> )
14095 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14096 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14097 :( IsUpper_v<MT4> ? i : 0UL ) );
14098 const size_t kend( ( IsUpper_v<MT5> )
14099 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
14100 :( K ) );
14101
14102 size_t k( kbegin );
14103
14104 if( k < kend )
14105 {
14106 SIMDType a1( A.load(i,k) );
14107 SIMDType xmm1( a1 * set( B(k,j ) ) );
14108 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
14109 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
14110 SIMDType xmm4( a1 * set( B(k,j+3UL) ) );
14111
14112 for( ++k; k<kend; ++k ) {
14113 a1 = A.load(i,k);
14114 xmm1 += a1 * set( B(k,j ) );
14115 xmm2 += a1 * set( B(k,j+1UL) );
14116 xmm3 += a1 * set( B(k,j+2UL) );
14117 xmm4 += a1 * set( B(k,j+3UL) );
14118 }
14119
14120 C.store( i, j , C.load(i,j ) - xmm1 * factor );
14121 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
14122 C.store( i, j+2UL, C.load(i,j+2UL) - xmm3 * factor );
14123 C.store( i, j+3UL, C.load(i,j+3UL) - xmm4 * factor );
14124 }
14125 }
14126
14127 for( ; (j+3UL) <= jend; j+=3UL )
14128 {
14129 const size_t kbegin( ( IsLower_v<MT5> )
14130 ?( ( IsUpper_v<MT4> )
14131 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14132 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14133 :( IsUpper_v<MT4> ? i : 0UL ) );
14134 const size_t kend( ( IsUpper_v<MT5> )
14135 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
14136 :( K ) );
14137
14138 size_t k( kbegin );
14139
14140 if( k < kend )
14141 {
14142 SIMDType a1( A.load(i,k) );
14143 SIMDType xmm1( a1 * set( B(k,j ) ) );
14144 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
14145 SIMDType xmm3( a1 * set( B(k,j+2UL) ) );
14146
14147 for( ++k; k<kend; ++k ) {
14148 a1 = A.load(i,k);
14149 xmm1 += a1 * set( B(k,j ) );
14150 xmm2 += a1 * set( B(k,j+1UL) );
14151 xmm3 += a1 * set( B(k,j+2UL) );
14152 }
14153
14154 C.store( i, j , C.load(i,j ) - xmm1 * factor );
14155 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
14156 C.store( i, j+2UL, C.load(i,j+2UL) - xmm3 * factor );
14157 }
14158 }
14159
14160 for( ; (j+2UL) <= jend; j+=2UL )
14161 {
14162 const size_t kbegin( ( IsLower_v<MT5> )
14163 ?( ( IsUpper_v<MT4> )
14164 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14165 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14166 :( IsUpper_v<MT4> ? i : 0UL ) );
14167 const size_t kend( ( IsUpper_v<MT5> )
14168 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
14169 :( K ) );
14170
14171 size_t k( kbegin );
14172
14173 if( k < kend )
14174 {
14175 SIMDType a1( A.load(i,k) );
14176 SIMDType xmm1( a1 * set( B(k,j ) ) );
14177 SIMDType xmm2( a1 * set( B(k,j+1UL) ) );
14178
14179 for( ++k; k<kend; ++k ) {
14180 a1 = A.load(i,k);
14181 xmm1 += a1 * set( B(k,j ) );
14182 xmm2 += a1 * set( B(k,j+1UL) );
14183 }
14184
14185 C.store( i, j , C.load(i,j ) - xmm1 * factor );
14186 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
14187 }
14188 }
14189
14190 if( j < jend )
14191 {
14192 const size_t kbegin( ( IsLower_v<MT5> )
14193 ?( ( IsUpper_v<MT4> )
14194 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14195 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14196 :( IsUpper_v<MT4> ? i : 0UL ) );
14197
14198 size_t k( kbegin );
14199
14200 if( k < K )
14201 {
14202 SIMDType xmm1( A.load(i,k) * set( B(k,j) ) );
14203
14204 for( ++k; k<K; ++k ) {
14205 xmm1 += A.load(i,k) * set( B(k,j) );
14206 }
14207
14208 C.store( i, j, C.load(i,j) - xmm1 * factor );
14209 }
14210 }
14211 }
14212
14213 for( ; remainder && i<M; ++i )
14214 {
14215 const size_t jend( LOW ? i+1UL : N );
14216 size_t j( UPP ? i : 0UL );
14217
14218 for( ; (j+2UL) <= jend; j+=2UL )
14219 {
14220 const size_t kbegin( ( IsLower_v<MT5> )
14221 ?( ( IsUpper_v<MT4> )
14222 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14223 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14224 :( IsUpper_v<MT4> ? i : 0UL ) );
14225 const size_t kend( ( IsUpper_v<MT5> )
14226 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
14227 :( K ) );
14228
14229 size_t k( kbegin );
14230
14231 if( k < kend )
14232 {
14233 ElementType value1( A(i,k) * B(k,j ) );
14234 ElementType value2( A(i,k) * B(k,j+1UL) );
14235
14236 for( ++k; k<kend; ++k ) {
14237 value1 += A(i,k) * B(k,j );
14238 value2 += A(i,k) * B(k,j+1UL);
14239 }
14240
14241 C(i,j ) -= value1 * scalar;
14242 C(i,j+1UL) -= value2 * scalar;
14243 }
14244 }
14245
14246 if( j < jend )
14247 {
14248 const size_t kbegin( ( IsLower_v<MT5> )
14249 ?( ( IsUpper_v<MT4> )
14250 ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14251 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14252 :( IsUpper_v<MT4> ? i : 0UL ) );
14253
14254 size_t k( kbegin );
14255
14256 if( k < K )
14257 {
14258 ElementType value( A(i,k) * B(k,j) );
14259
14260 for( ++k; k<K; ++k ) {
14261 value += A(i,k) * B(k,j);
14262 }
14263
14264 C(i,j) -= value * scalar;
14265 }
14266 }
14267 }
14268 }
14269 //**********************************************************************************************
14270
14271 //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix subtraction assignment kernel, non-vectorized fallback.
// This overload takes part in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t). It performs no work of
// its own and forwards all four arguments unchanged to selectDefaultSubAssignKernel().
14285 template< typename MT3 // Type of the left-hand side target matrix
14286 , typename MT4 // Type of the left-hand side matrix operand
14287 , typename MT5 // Type of the right-hand side matrix operand
14288 , typename ST2 > // Type of the scalar value
14289 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
14290 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
14291 {
14292 selectDefaultSubAssignKernel( C, A, B, scalar );
14293 }
14294 //**********************************************************************************************
14295
14296 //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix subtraction assignment kernel, vectorized variant.
// This overload is enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// It dispatches on the LOW/UPP flags to lmmm(), ummm() or mmm(). LOW is tested first, so if both
// flags are set lmmm() is the kernel that runs.
// All three calls receive the negated scalar and ST2(1) as the trailing arguments.
// NOTE(review): presumably these are the product factor and the factor applied to the existing
// content of C (i.e. C = 1*C + (-scalar)*A*B) -- confirm against the mmm() declaration.
14311 template< typename MT3 // Type of the left-hand side target matrix
14312 , typename MT4 // Type of the left-hand side matrix operand
14313 , typename MT5 // Type of the right-hand side matrix operand
14314 , typename ST2 > // Type of the scalar value
14315 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
14316 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
14317 {
14318 if( LOW )
14319 lmmm( C, A, B, -scalar, ST2(1) );
14320 else if( UPP )
14321 ummm( C, A, B, -scalar, ST2(1) );
14322 else
14323 mmm( C, A, B, -scalar, ST2(1) );
14324 }
14325 //**********************************************************************************************
14326
14327 //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS subtraction assignment kernel, fallback overload.
// This overload is selected when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
// No BLAS routine is called here; the arguments are forwarded unchanged to
// selectLargeSubAssignKernel().
14341 template< typename MT3 // Type of the left-hand side target matrix
14342 , typename MT4 // Type of the left-hand side matrix operand
14343 , typename MT5 // Type of the right-hand side matrix operand
14344 , typename ST2 > // Type of the scalar value
14345 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
14346 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
14347 {
14348 selectLargeSubAssignKernel( C, A, B, scalar );
14349 }
14350 //**********************************************************************************************
14351
14352 //**BLAS-based subtraction assignment to dense matrices*****************************************
14353#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS subtraction assignment kernel, BLAS-backed overload.
// It is compiled only when both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// are non-zero, and is enabled only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
// Three cases are distinguished:
//  - A is triangular: B is copied into a temporary, trmm() is applied with A on the left side
//    and the scalar, and the temporary is subtracted from C via subAssign().
//  - otherwise, B is triangular: the same scheme with A copied and B applied on the right side.
//  - neither is triangular: a single gemm() call with the negated scalar and ET(1).
// The scalar is converted to the element type of the target matrix (ET) before each BLAS call.
14367 template< typename MT3 // Type of the left-hand side target matrix
14368 , typename MT4 // Type of the left-hand side matrix operand
14369 , typename MT5 // Type of the right-hand side matrix operand
14370 , typename ST2 > // Type of the scalar value
14371 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
14372 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
14373 {
14374 using ET = ElementType_t<MT3>;
14375
      // Triangular left operand: the scaled product is built in 'tmp' (positive scalar),
      // the subtraction itself is done by subAssign().
14376 if( IsTriangular_v<MT4> ) {
14377 ResultType_t<MT3> tmp( serial( B ) );
14378 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
14379 subAssign( C, tmp );
14380 }
      // Triangular right operand: mirror image of the branch above, with B on the right side.
14381 else if( IsTriangular_v<MT5> ) {
14382 ResultType_t<MT3> tmp( serial( A ) );
14383 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
14384 subAssign( C, tmp );
14385 }
      // General case: no temporary, the sign of the scalar is flipped instead.
14386 else {
14387 gemm( C, A, B, ET(-scalar), ET(1) );
14388 }
14389 }
14390#endif
14391 //**********************************************************************************************
14392
14393 //**Subtraction assignment to sparse matrices***************************************************
14394 // No special implementation for the subtraction assignment to sparse matrices.
14395 //**********************************************************************************************
14396
14397 //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of a scaled matrix product to a dense matrix.
// The right-hand side expression is first evaluated serially into a temporary of type
// ResultType; the temporary is then passed on to schurAssign() for the target matrix.
// The internal assertions require lhs and rhs to have the same number of rows and columns.
14409 template< typename MT // Type of the target dense matrix
14410 , bool SO > // Storage order of the target dense matrix
14411 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
14412 {
14414
14418
14419 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
14420 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
14421
14422 const ResultType tmp( serial( rhs ) );
14423 schurAssign( *lhs, tmp );
14424 }
14425 //**********************************************************************************************
14426
14427 //**Schur product assignment to sparse matrices*************************************************
14428 // No special implementation for the Schur product assignment to sparse matrices.
14429 //**********************************************************************************************
14430
14431 //**Multiplication assignment to dense matrices*************************************************
14432 // No special implementation for the multiplication assignment to dense matrices.
14433 //**********************************************************************************************
14434
14435 //**Multiplication assignment to sparse matrices************************************************
14436 // No special implementation for the multiplication assignment to sparse matrices.
14437 //**********************************************************************************************
14438
14439 //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled matrix product to a dense matrix.
// This overload is enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// Steps:
//  1. fetch the two operands of the wrapped product (rhs.matrix_);
//  2. return immediately if the target has no rows or no columns;
//  3. if the left operand has zero columns, reset the target and return;
//  4. evaluate both operands into LT/RT objects and hand the rebuilt expression
//     A * B * rhs.scalar_ to smpAssign() for the target matrix.
14454 template< typename MT // Type of the target dense matrix
14455 , bool SO > // Storage order of the target dense matrix
14456 friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
14457 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
14458 {
14460
14461 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
14462 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
14463
14464 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
14465 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
14466
      // Empty target: nothing to assign.
14467 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
14468 return;
14469 }
      // Left operand without columns: the target is reset instead of being multiplied into.
14470 else if( left.columns() == 0UL ) {
14471 reset( *lhs );
14472 return;
14473 }
14474
14475 LT A( left ); // Evaluation of the left-hand side dense matrix operand
14476 RT B( right ); // Evaluation of the right-hand side dense matrix operand
14477
14478 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
14479 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
14480 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
14481 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
14482 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
14483 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
14484
14485 smpAssign( *lhs, A * B * rhs.scalar_ );
14486 }
14487 //**********************************************************************************************
14488
14489 //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a scaled matrix product to a sparse matrix.
// This overload is enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// The expression is evaluated into a temporary whose type depends on the storage order flag
// of the target: ResultType if SO is true, OppositeType otherwise. The temporary is passed
// through a ForwardFunctor object and then assigned via smpAssign().
// NOTE(review): ForwardFunctor is declared outside this section; what it does to the
// temporary cannot be told from here -- confirm at its declaration.
14504 template< typename MT // Type of the target sparse matrix
14505 , bool SO > // Storage order of the target sparse matrix
14506 friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
14507 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
14508 {
14510
14511 using TmpType = If_t< SO, ResultType, OppositeType >;
14512
14519
14520 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
14521 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
14522
14523 const ForwardFunctor fwd;
14524
14525 const TmpType tmp( rhs );
14526 smpAssign( *lhs, fwd( tmp ) );
14527 }
14528 //**********************************************************************************************
14529
14530 //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled matrix product to a dense matrix.
// This overload is enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// It returns without touching the target if the target has no rows or no columns, or if the
// left operand has no columns. Otherwise both operands of the wrapped product are evaluated
// into LT/RT objects and A * B * rhs.scalar_ is passed to smpAddAssign() for the target.
14545 template< typename MT // Type of the target dense matrix
14546 , bool SO > // Storage order of the target dense matrix
14547 friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
14548 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
14549 {
14551
14552 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
14553 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
14554
14555 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
14556 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
14557
      // Unlike plain assignment, no reset is performed here: the target is left as it is.
14558 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
14559 return;
14560 }
14561
14562 LT A( left ); // Evaluation of the left-hand side dense matrix operand
14563 RT B( right ); // Evaluation of the right-hand side dense matrix operand
14564
14565 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
14566 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
14567 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
14568 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
14569 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
14570 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
14571
14572 smpAddAssign( *lhs, A * B * rhs.scalar_ );
14573 }
14574 //**********************************************************************************************
14575
14576 //**SMP addition assignment to sparse matrices**************************************************
14577 // No special implementation for the SMP addition assignment to sparse matrices.
14578 //**********************************************************************************************
14579
14580 //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled matrix product to a dense matrix.
// This overload is enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// It returns without touching the target if the target has no rows or no columns, or if the
// left operand has no columns. Otherwise both operands of the wrapped product are evaluated
// into LT/RT objects and A * B * rhs.scalar_ is passed to smpSubAssign() for the target.
14595 template< typename MT // Type of the target dense matrix
14596 , bool SO > // Storage order of the target dense matrix
14597 friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
14598 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
14599 {
14601
14602 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
14603 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
14604
14605 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
14606 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
14607
      // No reset is performed here: in all three cases the target is left as it is.
14608 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
14609 return;
14610 }
14611
14612 LT A( left ); // Evaluation of the left-hand side dense matrix operand
14613 RT B( right ); // Evaluation of the right-hand side dense matrix operand
14614
14615 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
14616 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
14617 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
14618 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
14619 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
14620 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
14621
14622 smpSubAssign( *lhs, A * B * rhs.scalar_ );
14623 }
14624 //**********************************************************************************************
14625
14626 //**SMP subtraction assignment to sparse matrices***********************************************
14627 // No special implementation for the SMP subtraction assignment to sparse matrices.
14628 //**********************************************************************************************
14629
14630 //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of a scaled matrix product to a dense matrix.
// The right-hand side expression is evaluated into a temporary of type ResultType, which is
// then passed to smpSchurAssign() for the target matrix. In contrast to the non-SMP
// schurAssign() in this class, the temporary is constructed from rhs directly, without serial().
// The internal assertions require lhs and rhs to have the same number of rows and columns.
14642 template< typename MT // Type of the target dense matrix
14643 , bool SO > // Storage order of the target dense matrix
14644 friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
14645 {
14647
14651
14652 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
14653 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
14654
14655 const ResultType tmp( rhs );
14656 smpSchurAssign( *lhs, tmp );
14657 }
14658 //**********************************************************************************************
14659
14660 //**SMP Schur product assignment to sparse matrices*********************************************
14661 // No special implementation for the SMP Schur product assignment to sparse matrices.
14662 //**********************************************************************************************
14663
14664 //**SMP multiplication assignment to dense matrices*********************************************
14665 // No special implementation for the SMP multiplication assignment to dense matrices.
14666 //**********************************************************************************************
14667
14668 //**SMP multiplication assignment to sparse matrices********************************************
14669 // No special implementation for the SMP multiplication assignment to sparse matrices.
14670 //**********************************************************************************************
14671
14672 //**Compile time checks*************************************************************************
14681 //**********************************************************************************************
14682};
14684//*************************************************************************************************
14685
14686
14687
14688
14689//=================================================================================================
14690//
14691// GLOBAL BINARY ARITHMETIC OPERATORS
14692//
14693//=================================================================================================
14694
14695//*************************************************************************************************
// Multiplication operator for two dense matrices whose storage-order flags are 'true'
// (left operand) and 'false' (right operand).
// Raises an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT when the number of columns
// of lhs differs from the number of rows of rhs; otherwise returns a ReturnType object built
// from the two operands.
// NOTE(review): the declaration of ReturnType is not part of the visible lines (the listing
// jumps from 14735 to 14737) -- presumably a local alias on the elided line; confirm in the
// original header.
14725template< typename MT1 // Type of the left-hand side dense matrix
14726 , typename MT2 > // Type of the right-hand side dense matrix
14727inline decltype(auto)
14728 operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,false>& rhs )
14729{
14731
14732 if( (*lhs).columns() != (*rhs).rows() ) {
14733 BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
14734 }
14735
14737 return ReturnType( *lhs, *rhs );
14738}
14739//*************************************************************************************************
14740
14741
14742
14743
14744//=================================================================================================
14745//
14746// GLOBAL FUNCTIONS
14747//
14748//=================================================================================================
14749
14750//*************************************************************************************************
// Declares the given matrix multiplication expression as symmetric.
// Raises an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if the expression is not
// square. Otherwise a new TDMatDMatMultExpr over the same two operands is returned, with the
// symmetry flag forced to 'true' and the HF/LF/UF flags carried over unchanged.
14775template< typename MT1 // Type of the left-hand side dense matrix
14776 , typename MT2 // Type of the right-hand side dense matrix
14777 , bool SF // Symmetry flag
14778 , bool HF // Hermitian flag
14779 , bool LF // Lower flag
14780 , bool UF > // Upper flag
14781inline decltype(auto) declsym( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
14782{
14784
14785 if( !isSquare( dm ) ) {
14786 BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
14787 }
14788
14789 using ReturnType = const TDMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
14790 return ReturnType( dm.leftOperand(), dm.rightOperand() );
14791}
14793//*************************************************************************************************
14794
14795
14796//*************************************************************************************************
// Declares the given matrix multiplication expression as Hermitian.
// Raises an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if the expression is not
// square. Otherwise a new TDMatDMatMultExpr over the same two operands is returned, with the
// Hermitian flag forced to 'true' and the SF/LF/UF flags carried over unchanged.
14821template< typename MT1 // Type of the left-hand side dense matrix
14822 , typename MT2 // Type of the right-hand side dense matrix
14823 , bool SF // Symmetry flag
14824 , bool HF // Hermitian flag
14825 , bool LF // Lower flag
14826 , bool UF > // Upper flag
14827inline decltype(auto) declherm( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
14828{
14830
14831 if( !isSquare( dm ) ) {
14832 BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
14833 }
14834
14835 using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
14836 return ReturnType( dm.leftOperand(), dm.rightOperand() );
14837}
14839//*************************************************************************************************
14840
14841
14842//*************************************************************************************************
// Declares the given matrix multiplication expression as lower.
// Raises an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if the expression is not
// square. Otherwise a new TDMatDMatMultExpr over the same two operands is returned, with the
// lower flag forced to 'true' and the SF/HF/UF flags carried over unchanged.
14867template< typename MT1 // Type of the left-hand side dense matrix
14868 , typename MT2 // Type of the right-hand side dense matrix
14869 , bool SF // Symmetry flag
14870 , bool HF // Hermitian flag
14871 , bool LF // Lower flag
14872 , bool UF > // Upper flag
14873inline decltype(auto) decllow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
14874{
14876
14877 if( !isSquare( dm ) ) {
14878 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
14879 }
14880
14881 using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
14882 return ReturnType( dm.leftOperand(), dm.rightOperand() );
14883}
14885//*************************************************************************************************
14886
14887
14888//*************************************************************************************************
// declunilow() overload for matrix multiplication expressions whose lower flag is 'false'
// (the LF position of the parameter type is fixed to false, so there is no LF template parameter).
// Raises an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if the expression is not
// square. Otherwise the expression is first passed through decllow() and the result is handed
// to declunilow().
// NOTE(review): the error text reads "lower", not "unilower" -- confirm whether that wording
// is intended.
14913template< typename MT1 // Type of the left-hand side dense matrix
14914 , typename MT2 // Type of the right-hand side dense matrix
14915 , bool SF // Symmetry flag
14916 , bool HF // Hermitian flag
14917 , bool UF > // Upper flag
14918inline decltype(auto) declunilow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
14919{
14921
14922 if( !isSquare( dm ) ) {
14923 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
14924 }
14925
14926 return declunilow( decllow( *dm ) );
14927}
14929//*************************************************************************************************
14930
14931
14932//*************************************************************************************************
14957template< typename MT1 // Type of the left-hand side dense matrix
14958 , typename MT2 // Type of the right-hand side dense matrix
14959 , bool SF // Symmetry flag
14960 , bool HF // Hermitian flag
14961 , bool UF > // Upper flag
14962inline decltype(auto) declstrlow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
14963{
14965
14966 if( !isSquare( dm ) ) {
14967 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
14968 }
14969
14970 return declstrlow( decllow( *dm ) );
14971}
14973//*************************************************************************************************
14974
14975
14976//*************************************************************************************************
15001template< typename MT1 // Type of the left-hand side dense matrix
15002 , typename MT2 // Type of the right-hand side dense matrix
15003 , bool SF // Symmetry flag
15004 , bool HF // Hermitian flag
15005 , bool LF // Lower flag
15006 , bool UF > // Upper flag
15007inline decltype(auto) declupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
15008{
15010
15011 if( !isSquare( dm ) ) {
15012 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
15013 }
15014
15015 using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
15016 return ReturnType( dm.leftOperand(), dm.rightOperand() );
15017}
15019//*************************************************************************************************
15020
15021
15022//*************************************************************************************************
15047template< typename MT1 // Type of the left-hand side dense matrix
15048 , typename MT2 // Type of the right-hand side dense matrix
15049 , bool SF // Symmetry flag
15050 , bool HF // Hermitian flag
15051 , bool LF > // Lower flag
15052inline decltype(auto) decluniupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
15053{
15055
15056 if( !isSquare( dm ) ) {
15057 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
15058 }
15059
15060 return decluniupp( declupp( *dm ) );
15061}
15063//*************************************************************************************************
15064
15065
15066//*************************************************************************************************
15091template< typename MT1 // Type of the left-hand side dense matrix
15092 , typename MT2 // Type of the right-hand side dense matrix
15093 , bool SF // Symmetry flag
15094 , bool HF // Hermitian flag
15095 , bool LF > // Lower flag
15096inline decltype(auto) declstrupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
15097{
15099
15100 if( !isSquare( dm ) ) {
15101 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
15102 }
15103
15104 return declstrupp( decllow( *dm ) );
15105}
15107//*************************************************************************************************
15108
15109
15110//*************************************************************************************************
15135template< typename MT1 // Type of the left-hand side dense matrix
15136 , typename MT2 // Type of the right-hand side dense matrix
15137 , bool SF // Symmetry flag
15138 , bool HF // Hermitian flag
15139 , bool LF // Lower flag
15140 , bool UF > // Upper flag
15141inline decltype(auto) decldiag( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
15142{
15144
15145 if( !isSquare( dm ) ) {
15146 BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
15147 }
15148
15149 using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
15150 return ReturnType( dm.leftOperand(), dm.rightOperand() );
15151}
15153//*************************************************************************************************
15154
15155
15156
15157
15158//=================================================================================================
15159//
15160// SIZE SPECIALIZATIONS
15161//
15162//=================================================================================================
15163
15164//*************************************************************************************************
15166template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
15167struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
15168 : public Size<MT1,0UL>
15169{};
15170
15171template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
15172struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
15173 : public Size<MT2,1UL>
15174{};
15176//*************************************************************************************************
15177
15178
15179
15180
15181//=================================================================================================
15182//
15183// ISALIGNED SPECIALIZATIONS
15184//
15185//=================================================================================================
15186
15187//*************************************************************************************************
15189template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
15190struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
15191 : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
15192{};
15194//*************************************************************************************************
15195
15196} // namespace blaze
15197
15198#endif
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.
Definition: Aliases.h:310
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for kernel specific block sizes.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Header file for the complex data type.
Header file for the conjugate shim.
Header file for the decldiag trait.
Header file for the DeclDiag functor.
Header file for the declherm trait.
Header file for the DeclHerm functor.
Header file for the decllow trait.
Header file for the DeclLow functor.
Header file for the declsym trait.
Header file for the DeclSym functor.
Header file for the declupp trait.
Header file for the DeclUpp functor.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsColumnMajorMatrix type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsRowMajorMatrix type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Header file for the dense matrix multiplication kernels.
Header file for the multiplication trait.
Header file for the Noop functor.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Data type constraint.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:592
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:548
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:170
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:108
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:602
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:167
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:176
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:159
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:570
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:538
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
MatScalarMultExpr< DenseMatrix< This, SO > > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:162
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:179
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:173
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:611
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:427
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:558
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:437
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:446
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:582
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:528
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:459
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:166
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:610
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:432
Base class for dense matrices.
Definition: DenseMatrix.h:82
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Expression object for transpose dense matrix-dense matrix multiplications.
Definition: TDMatDMatMultExpr.h:148
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:281
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:465
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDMatMultExpr.h:289
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:278
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:275
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:326
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:172
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:268
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:166
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:421
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:375
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:171
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:311
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:173
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:272
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:271
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:455
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:477
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:478
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:411
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:267
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:401
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:391
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:269
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDMatMultExpr.h:302
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:433
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:445
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:161
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:151
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:265
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:270
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:284
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDMatMultExpr.h:296
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:170
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseMatrix base class.
Header file for the MatMatMultExpr base class.
Header file for the MatScalarMultExpr base class.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) declstrupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly upper.
Definition: DMatDeclStrUppExpr.h:1003
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1464
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:978
decltype(auto) declstrlow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly lower.
Definition: DMatDeclStrLowExpr.h:1003
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1004
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1004
decltype(auto) decluniupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as uniupper.
Definition: DMatDeclUniUppExpr.h:1005
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1005
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1005
decltype(auto) declunilow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as unilower.
Definition: DMatDeclUniLowExpr.h:1004
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.
Definition: StorageOrder.h:84
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatMatMultExpr.h:103
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
constexpr bool IsSIMDCombinable_v
Auxiliary variable template for the IsSIMDCombinable type trait.
Definition: IsSIMDCombinable.h:137
constexpr bool HasSIMDAdd_v
Auxiliary variable template for the HasSIMDAdd type trait.
Definition: HasSIMDAdd.h:187
constexpr bool HasSIMDMult_v
Auxiliary variable template for the HasSIMDMult type trait.
Definition: HasSIMDMult.h:188
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:584
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:1383
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
constexpr decltype(auto) zero(size_t m, size_t n) noexcept
Creating a zero matrix.
Definition: ZeroMatrix.h:1356
Header file for the exception macros of the math module.
Constraints on the storage order of matrix types.
Header file for all forward declarations for expression class templates.
Header file for the Size type trait.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:127
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:61
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:126
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:61
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:126
Generic wrapper for the decllow() function.
Definition: DeclLow.h:61
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:126
Generic wrapper for the declsym() function.
Definition: DeclSym.h:61
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:126
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:61
Base class for all matrix/matrix multiplication expression templates.
Definition: MatMatMultExpr.h:71
Base template for the MultTrait class.
Definition: MultTrait.h:130
Generic wrapper for the null function.
Definition: Noop.h:62
System settings for the BLAS mode.
System settings for the debugging policy of the Blaze library.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.