Blaze 3.9
DMatTDMatMultExpr.h
Go to the documentation of this file.
1//=================================================================================================
33//=================================================================================================
34
35#ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
37
38
39//*************************************************************************************************
40// Includes
41//*************************************************************************************************
42
45#include <blaze/math/Aliases.h>
70#include <blaze/math/SIMD.h>
99#include <blaze/system/BLAS.h>
105#include <blaze/util/Assert.h>
106#include <blaze/util/Complex.h>
108#include <blaze/util/EnableIf.h>
111#include <blaze/util/mpl/If.h>
112#include <blaze/util/Types.h>
120
121
122namespace blaze {
123
124//=================================================================================================
125//
126// CLASS DMATTDMATMULTEXPR
127//
128//=================================================================================================
129
130//*************************************************************************************************
137template< typename MT1 // Type of the left-hand side dense matrix
138 , typename MT2 // Type of the right-hand side dense matrix
139 , bool SF // Symmetry flag
140 , bool HF // Hermitian flag
141 , bool LF // Lower flag
142 , bool UF > // Upper flag
144 : public MatMatMultExpr< DenseMatrix< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
145 , private Computation
146{
147 private:
148 //**Type definitions****************************************************************************
155 //**********************************************************************************************
156
157 //**********************************************************************************************
159 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
160 //**********************************************************************************************
161
162 //**********************************************************************************************
164 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
165 //**********************************************************************************************
166
167 //**********************************************************************************************
168 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
169 static constexpr bool HERM = ( HF && !( LF || UF ) );
170 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
171 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
172 //**********************************************************************************************
173
174 //**********************************************************************************************
176
180 template< typename T1, typename T2, typename T3 >
181 static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
183 //**********************************************************************************************
184
185 //**********************************************************************************************
187
190 template< typename T1, typename T2, typename T3 >
191 static constexpr bool UseBlasKernel_v =
192 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
193 !SYM && !HERM && !LOW && !UPP &&
194 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
195 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
196 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
197 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
198 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
199 IsBLASCompatible_v< ElementType_t<T1> > &&
200 IsBLASCompatible_v< ElementType_t<T2> > &&
201 IsBLASCompatible_v< ElementType_t<T3> > &&
202 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
203 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
205 //**********************************************************************************************
206
207 //**********************************************************************************************
209
212 template< typename T1, typename T2, typename T3 >
213 static constexpr bool UseVectorizedDefaultKernel_v =
214 ( useOptimizedKernels &&
215 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
216 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
217 IsSIMDCombinable_v< ElementType_t<T1>
219 , ElementType_t<T3> > &&
220 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
221 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
223 //**********************************************************************************************
224
225 //**********************************************************************************************
227
230 using ForwardFunctor = If_t< HERM
231 , DeclHerm
232 , If_t< SYM
233 , DeclSym
234 , If_t< LOW
235 , If_t< UPP
236 , DeclDiag
237 , DeclLow >
238 , If_t< UPP
239 , DeclUpp
240 , Noop > > > >;
242 //**********************************************************************************************
243
244 public:
245 //**Type definitions****************************************************************************
248
251
// Result type of the expression. The nested If_t cascade selects a trait class depending on
// the effective structure flags HERM, SYM, LOW and UPP and takes its nested \a Type; the
// fallback when no flag is set is MultTrait<RT1,RT2>.
// NOTE(review): this listing skips the source lines numbered 254, 256, 259, 260 and 262, so
// the "then" branches of the cascade are not visible here and the angle brackets do not
// balance as shown. Restore the missing branches from the original header before compiling.
253 using ResultType = typename If_t< HERM
255 , If_t< SYM
257 , If_t< LOW
258 , If_t< UPP
261 , If_t< UPP
263 , MultTrait<RT1,RT2> > > > >::Type;
264
// Type returned by the element access functions operator() and at(): a constant element.
269 using ReturnType = const ElementType;
// Data type for composite expression templates: the constant result type.
270 using CompositeType = const ResultType;
271
// Type used to store the left-hand side operand: a copy if MT1 is an expression, a constant
// reference to MT1 otherwise.
273 using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
274
// Type used to store the right-hand side operand: a copy if MT2 is an expression, a constant
// reference to MT2 otherwise.
276 using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
277
280
283 //**********************************************************************************************
284
285 //**Compilation flags***************************************************************************
287 static constexpr bool simdEnabled =
288 ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
289 MT1::simdEnabled && MT2::simdEnabled &&
290 HasSIMDAdd_v<ET1,ET2> &&
291 HasSIMDMult_v<ET1,ET2> );
292
294 static constexpr bool smpAssignable =
295 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
296 //**********************************************************************************************
297
298 //**SIMD properties*****************************************************************************
300 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
301 //**********************************************************************************************
302
303 //**Constructor*********************************************************************************
309 inline DMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
310 : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
311 , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
312 {
313 BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
314 }
315 //**********************************************************************************************
316
317 //**Access operator*****************************************************************************
324 inline ReturnType operator()( size_t i, size_t j ) const {
325 BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
326 BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
327
328 if( IsDiagonal_v<MT1> ) {
329 return lhs_(i,i) * rhs_(i,j);
330 }
331 else if( IsDiagonal_v<MT2> ) {
332 return lhs_(i,j) * rhs_(j,j);
333 }
334 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
335 const size_t begin( ( IsUpper_v<MT1> )
336 ?( ( IsLower_v<MT2> )
337 ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
338 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
339 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
340 :( ( IsLower_v<MT2> )
341 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
342 :( 0UL ) ) );
343 const size_t end( ( IsLower_v<MT1> )
344 ?( ( IsUpper_v<MT2> )
345 ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
346 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
347 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
348 :( ( IsUpper_v<MT2> )
349 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
350 :( lhs_.columns() ) ) );
351
352 if( begin >= end ) return ElementType();
353
354 const size_t n( end - begin );
355
356 return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
358 }
359 else {
360 return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
361 }
362 }
363 //**********************************************************************************************
364
365 //**At function*********************************************************************************
373 inline ReturnType at( size_t i, size_t j ) const {
374 if( i >= lhs_.rows() ) {
375 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
376 }
377 if( j >= rhs_.columns() ) {
378 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
379 }
380 return (*this)(i,j);
381 }
382 //**********************************************************************************************
383
384 //**Rows function*******************************************************************************
389 inline size_t rows() const noexcept {
390 return lhs_.rows();
391 }
392 //**********************************************************************************************
393
394 //**Columns function****************************************************************************
399 inline size_t columns() const noexcept {
400 return rhs_.columns();
401 }
402 //**********************************************************************************************
403
404 //**Left operand access*************************************************************************
409 inline LeftOperand leftOperand() const noexcept {
410 return lhs_;
411 }
412 //**********************************************************************************************
413
414 //**Right operand access************************************************************************
419 inline RightOperand rightOperand() const noexcept {
420 return rhs_;
421 }
422 //**********************************************************************************************
423
424 //**********************************************************************************************
430 template< typename T >
431 inline bool canAlias( const T* alias ) const noexcept {
432 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
433 }
434 //**********************************************************************************************
435
436 //**********************************************************************************************
442 template< typename T >
443 inline bool isAliased( const T* alias ) const noexcept {
444 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
445 }
446 //**********************************************************************************************
447
448 //**********************************************************************************************
453 inline bool isAligned() const noexcept {
454 return lhs_.isAligned() && rhs_.isAligned();
455 }
456 //**********************************************************************************************
457
458 //**********************************************************************************************
463 inline bool canSMPAssign() const noexcept {
464 return ( !BLAZE_BLAS_MODE ||
465 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
467 ( rows() * columns() < DMATTDMATMULT_THRESHOLD ) ) &&
468 ( rows() * columns() >= SMP_DMATTDMATMULT_THRESHOLD ) &&
469 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
470 }
471 //**********************************************************************************************
472
473 private:
474 //**Member variables****************************************************************************
477 //**********************************************************************************************
478
479 //**Assignment to dense matrices****************************************************************
492 template< typename MT // Type of the target dense matrix
493 , bool SO > // Storage order of the target dense matrix
494 friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
495 {
497
498 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
499 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
500
501 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
502 return;
503 }
504 else if( rhs.lhs_.columns() == 0UL ) {
505 reset( *lhs );
506 return;
507 }
508
509 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
510 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
511
512 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
513 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
514 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
515 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
516 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
517 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
518
519 DMatTDMatMultExpr::selectAssignKernel( *lhs, A, B );
520 }
522 //**********************************************************************************************
523
524 //**Assignment to dense matrices (kernel selection)*********************************************
535 template< typename MT3 // Type of the left-hand side target matrix
536 , typename MT4 // Type of the left-hand side matrix operand
537 , typename MT5 > // Type of the right-hand side matrix operand
538 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
539 {
540 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
541 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
542 selectSmallAssignKernel( C, A, B );
543 else
544 selectBlasAssignKernel( C, A, B );
545 }
547 //**********************************************************************************************
548
549 //**Default assignment to row-major dense matrices (general/general)****************************
563 template< typename MT3 // Type of the left-hand side target matrix
564 , typename MT4 // Type of the left-hand side matrix operand
565 , typename MT5 > // Type of the right-hand side matrix operand
566 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
567 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
568 {
569 const size_t M( A.rows() );
570 const size_t N( B.columns() );
571 const size_t K( A.columns() );
572
573 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
574
575 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
576 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
577 :( 0UL ) );
578 const size_t iend( ( IsStrictlyUpper_v<MT4> )
579 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
580 :( M ) );
581 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
582
583 for( size_t i=0UL; i<ibegin; ++i ) {
584 for( size_t j=0UL; j<N; ++j ) {
585 reset( C(i,j) );
586 }
587 }
588 for( size_t i=ibegin; i<iend; ++i )
589 {
590 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
591 ?( ( IsStrictlyUpper_v<MT4> )
592 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
593 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
594 :( ( IsStrictlyUpper_v<MT5> )
595 ?( SYM || HERM || UPP ? max( i, 1UL ) : 1UL )
596 :( SYM || HERM || UPP ? i : 0UL ) ) );
597 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
598 ?( ( IsStrictlyLower_v<MT4> )
599 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
600 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
601 :( ( IsStrictlyLower_v<MT5> )
602 ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
603 :( LOW ? i+1UL : N ) ) );
604
605 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
606 for( size_t j=0UL; j<N; ++j ) {
607 reset( C(i,j) );
608 }
609 continue;
610 }
611
612 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
613
614 for( size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
615 reset( C(i,j) );
616 }
617 for( size_t j=jbegin; j<jend; ++j )
618 {
619 const size_t kbegin( ( IsUpper_v<MT4> )
620 ?( ( IsLower_v<MT5> )
621 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
622 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
623 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
624 :( ( IsLower_v<MT5> )
625 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
626 :( 0UL ) ) );
627 const size_t kend( ( IsLower_v<MT4> )
628 ?( ( IsUpper_v<MT5> )
629 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
630 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
631 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
632 :( ( IsUpper_v<MT5> )
633 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
634 :( K ) ) );
635 BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
636
637 C(i,j) = A(i,kbegin) * B(kbegin,j);
638 for( size_t k=kbegin+1UL; k<kend; ++k ) {
639 C(i,j) += A(i,k) * B(k,j);
640 }
641 }
642 for( size_t j=jend; j<N; ++j ) {
643 reset( C(i,j) );
644 }
645 }
646 for( size_t i=iend; i<M; ++i ) {
647 for( size_t j=0UL; j<N; ++j ) {
648 reset( C(i,j) );
649 }
650 }
651
652 if( SYM || HERM ) {
653 for( size_t i=1UL; i<M; ++i ) {
654 for( size_t j=0UL; j<i; ++j ) {
655 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
656 }
657 }
658 }
659 }
661 //**********************************************************************************************
662
663 //**Default assignment to column-major dense matrices (general/general)*************************
677 template< typename MT3 // Type of the left-hand side target matrix
678 , typename MT4 // Type of the left-hand side matrix operand
679 , typename MT5 > // Type of the right-hand side matrix operand
680 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
681 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
682 {
683 const size_t M( A.rows() );
684 const size_t N( B.columns() );
685 const size_t K( A.columns() );
686
687 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
688
689 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
690 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
691 :( 0UL ) );
692 const size_t jend( ( IsStrictlyLower_v<MT5> )
693 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
694 :( N ) );
695 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
696
697 for( size_t j=0UL; j<jbegin; ++j ) {
698 for( size_t i=0UL; i<M; ++i ) {
699 reset( C(i,j) );
700 }
701 }
702 for( size_t j=jbegin; j<jend; ++j )
703 {
704 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
705 ?( ( IsStrictlyLower_v<MT4> )
706 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
707 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
708 :( ( IsStrictlyLower_v<MT4> )
709 ?( SYM || HERM || LOW ? max( j, 1UL ) : 1UL )
710 :( SYM || HERM || LOW ? j : 0UL ) ) );
711 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
712 ?( ( IsStrictlyUpper_v<MT4> )
713 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
714 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
715 :( ( IsStrictlyUpper_v<MT4> )
716 ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
717 :( UPP ? j+1UL : M ) ) );
718
719 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
720 for( size_t i=0UL; i<M; ++i ) {
721 reset( C(i,j) );
722 }
723 continue;
724 }
725
726 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
727
728 for( size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
729 reset( C(i,j) );
730 }
731 for( size_t i=ibegin; i<iend; ++i )
732 {
733 const size_t kbegin( ( IsUpper_v<MT4> )
734 ?( ( IsLower_v<MT5> )
735 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
736 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
737 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
738 :( ( IsLower_v<MT5> )
739 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
740 :( 0UL ) ) );
741 const size_t kend( ( IsLower_v<MT4> )
742 ?( ( IsUpper_v<MT5> )
743 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
744 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
745 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
746 :( ( IsUpper_v<MT5> )
747 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
748 :( K ) ) );
749 BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
750
751 C(i,j) = A(i,kbegin) * B(kbegin,j);
752 for( size_t k=kbegin+1UL; k<kend; ++k ) {
753 C(i,j) += A(i,k) * B(k,j);
754 }
755 }
756 for( size_t i=iend; i<M; ++i ) {
757 reset( C(i,j) );
758 }
759 }
760 for( size_t j=jend; j<N; ++j ) {
761 for( size_t i=0UL; i<M; ++i ) {
762 reset( C(i,j) );
763 }
764 }
765
766 if( SYM || HERM ) {
767 for( size_t j=1UL; j<N; ++j ) {
768 for( size_t i=0UL; i<j; ++i ) {
769 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
770 }
771 }
772 }
773 }
775 //**********************************************************************************************
776
777 //**Default assignment to row-major dense matrices (general/diagonal)***************************
791 template< typename MT3 // Type of the left-hand side target matrix
792 , typename MT4 // Type of the left-hand side matrix operand
793 , typename MT5 > // Type of the right-hand side matrix operand
794 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
795 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
796 {
797 const size_t M( A.rows() );
798 const size_t N( B.columns() );
799
800 for( size_t i=0UL; i<M; ++i )
801 {
802 const size_t jbegin( ( IsUpper_v<MT4> )
803 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
804 :( 0UL ) );
805 const size_t jend( ( IsLower_v<MT4> )
806 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
807 :( N ) );
808 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
809
810 if( IsUpper_v<MT4> ) {
811 for( size_t j=0UL; j<jbegin; ++j ) {
812 reset( C(i,j) );
813 }
814 }
815 for( size_t j=jbegin; j<jend; ++j ) {
816 C(i,j) = A(i,j) * B(j,j);
817 }
818 if( IsLower_v<MT4> ) {
819 for( size_t j=jend; j<N; ++j ) {
820 reset( C(i,j) );
821 }
822 }
823 }
824 }
826 //**********************************************************************************************
827
828 //**Default assignment to column-major dense matrices (general/diagonal)************************
842 template< typename MT3 // Type of the left-hand side target matrix
843 , typename MT4 // Type of the left-hand side matrix operand
844 , typename MT5 > // Type of the right-hand side matrix operand
845 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
846 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
847 {
848 constexpr size_t block( BLOCK_SIZE );
849
850 const size_t M( A.rows() );
851 const size_t N( B.columns() );
852
853 for( size_t jj=0UL; jj<N; jj+=block ) {
854 const size_t jend( min( N, jj+block ) );
855 for( size_t ii=0UL; ii<M; ii+=block ) {
856 const size_t iend( min( M, ii+block ) );
857 for( size_t j=jj; j<jend; ++j )
858 {
859 const size_t ibegin( ( IsLower_v<MT4> )
860 ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
861 :( ii ) );
862 const size_t ipos( ( IsUpper_v<MT4> )
863 ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
864 :( iend ) );
865
866 if( IsLower_v<MT4> ) {
867 for( size_t i=ii; i<ibegin; ++i ) {
868 reset( C(i,j) );
869 }
870 }
871 for( size_t i=ibegin; i<ipos; ++i ) {
872 C(i,j) = A(i,j) * B(j,j);
873 }
874 if( IsUpper_v<MT4> ) {
875 for( size_t i=ipos; i<iend; ++i ) {
876 reset( C(i,j) );
877 }
878 }
879 }
880 }
881 }
882 }
884 //**********************************************************************************************
885
886 //**Default assignment to row-major dense matrices (diagonal/general)***************************
900 template< typename MT3 // Type of the left-hand side target matrix
901 , typename MT4 // Type of the left-hand side matrix operand
902 , typename MT5 > // Type of the right-hand side matrix operand
903 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
904 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
905 {
906 constexpr size_t block( BLOCK_SIZE );
907
908 const size_t M( A.rows() );
909 const size_t N( B.columns() );
910
911 for( size_t ii=0UL; ii<M; ii+=block ) {
912 const size_t iend( min( M, ii+block ) );
913 for( size_t jj=0UL; jj<N; jj+=block ) {
914 const size_t jend( min( N, jj+block ) );
915 for( size_t i=ii; i<iend; ++i )
916 {
917 const size_t jbegin( ( IsUpper_v<MT5> )
918 ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
919 :( jj ) );
920 const size_t jpos( ( IsLower_v<MT5> )
921 ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
922 :( jend ) );
923
924 if( IsUpper_v<MT5> ) {
925 for( size_t j=jj; j<jbegin; ++j ) {
926 reset( C(i,j) );
927 }
928 }
929 for( size_t j=jbegin; j<jpos; ++j ) {
930 C(i,j) = A(i,i) * B(i,j);
931 }
932 if( IsLower_v<MT5> ) {
933 for( size_t j=jpos; j<jend; ++j ) {
934 reset( C(i,j) );
935 }
936 }
937 }
938 }
939 }
940 }
942 //**********************************************************************************************
943
944 //**Default assignment to column-major dense matrices (diagonal/general)************************
958 template< typename MT3 // Type of the left-hand side target matrix
959 , typename MT4 // Type of the left-hand side matrix operand
960 , typename MT5 > // Type of the right-hand side matrix operand
961 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
962 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
963 {
964 const size_t M( A.rows() );
965 const size_t N( B.columns() );
966
967 for( size_t j=0UL; j<N; ++j )
968 {
969 const size_t ibegin( ( IsLower_v<MT5> )
970 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
971 :( 0UL ) );
972 const size_t iend( ( IsUpper_v<MT5> )
973 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
974 :( M ) );
975 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
976
977 if( IsLower_v<MT5> ) {
978 for( size_t i=0UL; i<ibegin; ++i ) {
979 reset( C(i,j) );
980 }
981 }
982 for( size_t i=ibegin; i<iend; ++i ) {
983 C(i,j) = A(i,i) * B(i,j);
984 }
985 if( IsUpper_v<MT5> ) {
986 for( size_t i=iend; i<M; ++i ) {
987 reset( C(i,j) );
988 }
989 }
990 }
991 }
993 //**********************************************************************************************
994
995 //**Default assignment to dense matrices (diagonal/diagonal)************************************
1009 template< typename MT3 // Type of the left-hand side target matrix
1010 , typename MT4 // Type of the left-hand side matrix operand
1011 , typename MT5 > // Type of the right-hand side matrix operand
1012 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1013 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1014 {
1015 reset( C );
1016
1017 for( size_t i=0UL; i<A.rows(); ++i ) {
1018 C(i,i) = A(i,i) * B(i,i);
1019 }
1020 }
1022 //**********************************************************************************************
1023
1024 //**Default assignment to dense matrices (small matrices)***************************************
1038 template< typename MT3 // Type of the left-hand side target matrix
1039 , typename MT4 // Type of the left-hand side matrix operand
1040 , typename MT5 > // Type of the right-hand side matrix operand
1041 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1042 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1043 {
1044 selectDefaultAssignKernel( C, A, B );
1045 }
1047 //**********************************************************************************************
1048
1049 //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1064 template< typename MT3 // Type of the left-hand side target matrix
1065 , typename MT4 // Type of the left-hand side matrix operand
1066 , typename MT5 > // Type of the right-hand side matrix operand
1067 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1068 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1069 {
1070 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1071
1072 const size_t M( A.rows() );
1073 const size_t N( B.columns() );
1074 const size_t K( A.columns() );
1075
1076 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1077
1078 size_t i( 0UL );
1079
1080 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
1081 {
1082 const size_t jend( LOW ? i+3UL : N );
1083 size_t j( 0UL );
1084
1085 if( SYM || HERM ) {
1086 for( ; j<i; ++j ) {
1087 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
1088 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
1089 C(i+2UL,j) = HERM ? conj( C(j,i+2UL) ) : C(j,i+2UL);
1090 }
1091 }
1092 else if( UPP ) {
1093 for( ; j<i; ++j ) {
1094 reset( C(i ,j) );
1095 reset( C(i+1UL,j) );
1096 reset( C(i+2UL,j) );
1097 }
1098 }
1099
1100 for( ; (j+3UL) <= jend; j+=3UL )
1101 {
1102 const size_t kbegin( ( IsUpper_v<MT4> )
1103 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1104 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1105 const size_t kend( ( IsLower_v<MT4> )
1106 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
1107 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
1108
1109 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1110 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1111
1112 size_t k( kbegin );
1113
1114 if( k < kpos )
1115 {
1116 SIMDType a1( A.load(i ,k) );
1117 SIMDType a2( A.load(i+1UL,k) );
1118 SIMDType a3( A.load(i+2UL,k) );
1119 SIMDType b1( B.load(k,j ) );
1120 SIMDType b2( B.load(k,j+1UL) );
1121 SIMDType b3( B.load(k,j+2UL) );
1122 SIMDType xmm1( a1 * b1 );
1123 SIMDType xmm2( a1 * b2 );
1124 SIMDType xmm3( a1 * b3 );
1125 SIMDType xmm4( a2 * b1 );
1126 SIMDType xmm5( a2 * b2 );
1127 SIMDType xmm6( a2 * b3 );
1128 SIMDType xmm7( a3 * b1 );
1129 SIMDType xmm8( a3 * b2 );
1130 SIMDType xmm9( a3 * b3 );
1131
1132 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1133 a1 = A.load(i ,k);
1134 a2 = A.load(i+1UL,k);
1135 a3 = A.load(i+2UL,k);
1136 b1 = B.load(k,j );
1137 b2 = B.load(k,j+1UL);
1138 b3 = B.load(k,j+2UL);
1139 xmm1 += a1 * b1;
1140 xmm2 += a1 * b2;
1141 xmm3 += a1 * b3;
1142 xmm4 += a2 * b1;
1143 xmm5 += a2 * b2;
1144 xmm6 += a2 * b3;
1145 xmm7 += a3 * b1;
1146 xmm8 += a3 * b2;
1147 xmm9 += a3 * b3;
1148 }
1149
1150 C(i ,j ) = sum( xmm1 );
1151 C(i ,j+1UL) = sum( xmm2 );
1152 C(i ,j+2UL) = sum( xmm3 );
1153 C(i+1UL,j ) = sum( xmm4 );
1154 C(i+1UL,j+1UL) = sum( xmm5 );
1155 C(i+1UL,j+2UL) = sum( xmm6 );
1156 C(i+2UL,j ) = sum( xmm7 );
1157 C(i+2UL,j+1UL) = sum( xmm8 );
1158 C(i+2UL,j+2UL) = sum( xmm9 );
1159
1160 for( ; remainder && k<kend; ++k ) {
1161 C(i ,j ) += A(i ,k) * B(k,j );
1162 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1163 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1164 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1165 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1166 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1167 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1168 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1169 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
1170 }
1171 }
1172 else if( k < kend )
1173 {
1174 ElementType value1( A(i ,k) * B(k,j ) );
1175 ElementType value2( A(i ,k) * B(k,j+1UL) );
1176 ElementType value3( A(i ,k) * B(k,j+2UL) );
1177 ElementType value4( A(i+1UL,k) * B(k,j ) );
1178 ElementType value5( A(i+1UL,k) * B(k,j+1UL) );
1179 ElementType value6( A(i+1UL,k) * B(k,j+2UL) );
1180 ElementType value7( A(i+2UL,k) * B(k,j ) );
1181 ElementType value8( A(i+2UL,k) * B(k,j+1UL) );
1182 ElementType value9( A(i+2UL,k) * B(k,j+2UL) );
1183
1184 for( ++k; k<kend; ++k ) {
1185 value1 += A(i ,k) * B(k,j );
1186 value2 += A(i ,k) * B(k,j+1UL);
1187 value3 += A(i ,k) * B(k,j+2UL);
1188 value4 += A(i+1UL,k) * B(k,j );
1189 value5 += A(i+1UL,k) * B(k,j+1UL);
1190 value6 += A(i+1UL,k) * B(k,j+2UL);
1191 value7 += A(i+2UL,k) * B(k,j );
1192 value8 += A(i+2UL,k) * B(k,j+1UL);
1193 value9 += A(i+2UL,k) * B(k,j+2UL);
1194 }
1195
1196 C(i ,j ) = value1;
1197 C(i ,j+1UL) = value2;
1198 C(i ,j+2UL) = value3;
1199 C(i+1UL,j ) = value4;
1200 C(i+1UL,j+1UL) = value5;
1201 C(i+1UL,j+2UL) = value6;
1202 C(i+2UL,j ) = value7;
1203 C(i+2UL,j+1UL) = value8;
1204 C(i+2UL,j+2UL) = value9;
1205 }
1206 else
1207 {
1208 reset( C(i ,j ) );
1209 reset( C(i ,j+1UL) );
1210 reset( C(i ,j+2UL) );
1211 reset( C(i+1UL,j ) );
1212 reset( C(i+1UL,j+1UL) );
1213 reset( C(i+1UL,j+2UL) );
1214 reset( C(i+2UL,j ) );
1215 reset( C(i+2UL,j+1UL) );
1216 reset( C(i+2UL,j+2UL) );
1217 }
1218 }
1219
1220 for( ; (j+2UL) <= jend; j+=2UL )
1221 {
1222 const size_t kbegin( ( IsUpper_v<MT4> )
1223 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1224 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1225 const size_t kend( ( IsLower_v<MT4> )
1226 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
1227 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1228
1229 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1230 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1231
1232 size_t k( kbegin );
1233
1234 if( k < kpos )
1235 {
1236 SIMDType a1( A.load(i ,k) );
1237 SIMDType a2( A.load(i+1UL,k) );
1238 SIMDType a3( A.load(i+2UL,k) );
1239 SIMDType b1( B.load(k,j ) );
1240 SIMDType b2( B.load(k,j+1UL) );
1241 SIMDType xmm1( a1 * b1 );
1242 SIMDType xmm2( a1 * b2 );
1243 SIMDType xmm3( a2 * b1 );
1244 SIMDType xmm4( a2 * b2 );
1245 SIMDType xmm5( a3 * b1 );
1246 SIMDType xmm6( a3 * b2 );
1247
1248 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1249 a1 = A.load(i ,k);
1250 a2 = A.load(i+1UL,k);
1251 a3 = A.load(i+2UL,k);
1252 b1 = B.load(k,j );
1253 b2 = B.load(k,j+1UL);
1254 xmm1 += a1 * b1;
1255 xmm2 += a1 * b2;
1256 xmm3 += a2 * b1;
1257 xmm4 += a2 * b2;
1258 xmm5 += a3 * b1;
1259 xmm6 += a3 * b2;
1260 }
1261
1262 C(i ,j ) = sum( xmm1 );
1263 C(i ,j+1UL) = sum( xmm2 );
1264 C(i+1UL,j ) = sum( xmm3 );
1265 C(i+1UL,j+1UL) = sum( xmm4 );
1266 C(i+2UL,j ) = sum( xmm5 );
1267 C(i+2UL,j+1UL) = sum( xmm6 );
1268
1269 for( ; remainder && k<kend; ++k ) {
1270 C(i ,j ) += A(i ,k) * B(k,j );
1271 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1272 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1273 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1274 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1275 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1276 }
1277 }
1278 else if( k < kend )
1279 {
1280 ElementType value1( A(i ,k) * B(k,j ) );
1281 ElementType value2( A(i ,k) * B(k,j+1UL) );
1282 ElementType value3( A(i+1UL,k) * B(k,j ) );
1283 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
1284 ElementType value5( A(i+2UL,k) * B(k,j ) );
1285 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
1286
1287 for( ++k; k<kend; ++k ) {
1288 value1 += A(i ,k) * B(k,j );
1289 value2 += A(i ,k) * B(k,j+1UL);
1290 value3 += A(i+1UL,k) * B(k,j );
1291 value4 += A(i+1UL,k) * B(k,j+1UL);
1292 value5 += A(i+2UL,k) * B(k,j );
1293 value6 += A(i+2UL,k) * B(k,j+1UL);
1294 }
1295
1296 C(i ,j ) = value1;
1297 C(i ,j+1UL) = value2;
1298 C(i+1UL,j ) = value3;
1299 C(i+1UL,j+1UL) = value4;
1300 C(i+2UL,j ) = value5;
1301 C(i+2UL,j+1UL) = value6;
1302 }
1303 else
1304 {
1305 reset( C(i ,j ) );
1306 reset( C(i ,j+1UL) );
1307 reset( C(i+1UL,j ) );
1308 reset( C(i+1UL,j+1UL) );
1309 reset( C(i+2UL,j ) );
1310 reset( C(i+2UL,j+1UL) );
1311 }
1312 }
1313
1314 if( j < jend )
1315 {
1316 const size_t kbegin( ( IsUpper_v<MT4> )
1317 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1318 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1319 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
1320
1321 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1322 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1323
1324 size_t k( kbegin );
1325
1326 if( k < kpos )
1327 {
1328 SIMDType b1( B.load(k,j) );
1329 SIMDType xmm1( A.load(i ,k) * b1 );
1330 SIMDType xmm2( A.load(i+1UL,k) * b1 );
1331 SIMDType xmm3( A.load(i+2UL,k) * b1 );
1332
1333 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1334 b1 = B.load(k,j);
1335 xmm1 += A.load(i ,k) * b1;
1336 xmm2 += A.load(i+1UL,k) * b1;
1337 xmm3 += A.load(i+2UL,k) * b1;
1338 }
1339
1340 C(i ,j) = sum( xmm1 );
1341 C(i+1UL,j) = sum( xmm2 );
1342 C(i+2UL,j) = sum( xmm3 );
1343
1344 for( ; remainder && k<kend; ++k ) {
1345 C(i ,j) += A(i ,k) * B(k,j);
1346 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1347 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
1348 }
1349 }
1350 else if( k < kend )
1351 {
1352 ElementType value1( A(i ,k) * B(k,j) );
1353 ElementType value2( A(i+1UL,k) * B(k,j) );
1354 ElementType value3( A(i+2UL,k) * B(k,j) );
1355
1356 for( ++k; k<kend; ++k ) {
1357 value1 += A(i ,k) * B(k,j);
1358 value2 += A(i+1UL,k) * B(k,j);
1359 value3 += A(i+2UL,k) * B(k,j);
1360 }
1361
1362 C(i ,j) = value1;
1363 C(i+1UL,j) = value2;
1364 C(i+2UL,j) = value3;
1365 }
1366 else
1367 {
1368 reset( C(i ,j) );
1369 reset( C(i+1UL,j) );
1370 reset( C(i+2UL,j) );
1371 }
1372
1373 if( LOW ) ++j;
1374 }
1375
1376 if( LOW ) {
1377 for( ; j<N; ++j ) {
1378 reset( C(i ,j) );
1379 reset( C(i+1UL,j) );
1380 reset( C(i+2UL,j) );
1381 }
1382 }
1383 }
1384
1385 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
1386 {
1387 const size_t jend( LOW ? i+2UL : N );
1388 size_t j( 0UL );
1389
1390 if( SYM || HERM ) {
1391 for( ; j<i; ++j ) {
1392 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
1393 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
1394 }
1395 }
1396 else if( UPP ) {
1397 for( ; j<i; ++j ) {
1398 reset( C(i ,j) );
1399 reset( C(i+1UL,j) );
1400 }
1401 }
1402
1403 for( ; (j+4UL) <= jend; j+=4UL )
1404 {
1405 const size_t kbegin( ( IsUpper_v<MT4> )
1406 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1407 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1408 const size_t kend( ( IsLower_v<MT4> )
1409 ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
1410 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
1411
1412 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1413 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1414
1415 size_t k( kbegin );
1416
1417 if( k < kpos )
1418 {
1419 SIMDType a1( A.load(i ,k) );
1420 SIMDType a2( A.load(i+1UL,k) );
1421 SIMDType b1( B.load(k,j ) );
1422 SIMDType b2( B.load(k,j+1UL) );
1423 SIMDType b3( B.load(k,j+2UL) );
1424 SIMDType b4( B.load(k,j+3UL) );
1425 SIMDType xmm1( a1 * b1 );
1426 SIMDType xmm2( a1 * b2 );
1427 SIMDType xmm3( a1 * b3 );
1428 SIMDType xmm4( a1 * b4 );
1429 SIMDType xmm5( a2 * b1 );
1430 SIMDType xmm6( a2 * b2 );
1431 SIMDType xmm7( a2 * b3 );
1432 SIMDType xmm8( a2 * b4 );
1433
1434 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1435 a1 = A.load(i ,k);
1436 a2 = A.load(i+1UL,k);
1437 b1 = B.load(k,j );
1438 b2 = B.load(k,j+1UL);
1439 b3 = B.load(k,j+2UL);
1440 b4 = B.load(k,j+3UL);
1441 xmm1 += a1 * b1;
1442 xmm2 += a1 * b2;
1443 xmm3 += a1 * b3;
1444 xmm4 += a1 * b4;
1445 xmm5 += a2 * b1;
1446 xmm6 += a2 * b2;
1447 xmm7 += a2 * b3;
1448 xmm8 += a2 * b4;
1449 }
1450
1451 C(i ,j ) = sum( xmm1 );
1452 C(i ,j+1UL) = sum( xmm2 );
1453 C(i ,j+2UL) = sum( xmm3 );
1454 C(i ,j+3UL) = sum( xmm4 );
1455 C(i+1UL,j ) = sum( xmm5 );
1456 C(i+1UL,j+1UL) = sum( xmm6 );
1457 C(i+1UL,j+2UL) = sum( xmm7 );
1458 C(i+1UL,j+3UL) = sum( xmm8 );
1459
1460 for( ; remainder && k<kend; ++k ) {
1461 C(i ,j ) += A(i ,k) * B(k,j );
1462 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1463 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1464 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1465 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1466 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1467 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1468 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
1469 }
1470 }
1471 else if( k < kend )
1472 {
1473 ElementType value1( A(i ,k) * B(k,j ) );
1474 ElementType value2( A(i ,k) * B(k,j+1UL) );
1475 ElementType value3( A(i ,k) * B(k,j+2UL) );
1476 ElementType value4( A(i ,k) * B(k,j+3UL) );
1477 ElementType value5( A(i+1UL,k) * B(k,j ) );
1478 ElementType value6( A(i+1UL,k) * B(k,j+1UL) );
1479 ElementType value7( A(i+1UL,k) * B(k,j+2UL) );
1480 ElementType value8( A(i+1UL,k) * B(k,j+3UL) );
1481
1482 for( ++k; k<kend; ++k ) {
1483 value1 += A(i ,k) * B(k,j );
1484 value2 += A(i ,k) * B(k,j+1UL);
1485 value3 += A(i ,k) * B(k,j+2UL);
1486 value4 += A(i ,k) * B(k,j+3UL);
1487 value5 += A(i+1UL,k) * B(k,j );
1488 value6 += A(i+1UL,k) * B(k,j+1UL);
1489 value7 += A(i+1UL,k) * B(k,j+2UL);
1490 value8 += A(i+1UL,k) * B(k,j+3UL);
1491 }
1492
1493 C(i ,j ) = value1;
1494 C(i ,j+1UL) = value2;
1495 C(i ,j+2UL) = value3;
1496 C(i ,j+3UL) = value4;
1497 C(i+1UL,j ) = value5;
1498 C(i+1UL,j+1UL) = value6;
1499 C(i+1UL,j+2UL) = value7;
1500 C(i+1UL,j+3UL) = value8;
1501 }
1502 else
1503 {
1504 reset( C(i ,j ) );
1505 reset( C(i ,j+1UL) );
1506 reset( C(i ,j+2UL) );
1507 reset( C(i ,j+3UL) );
1508 reset( C(i+1UL,j ) );
1509 reset( C(i+1UL,j+1UL) );
1510 reset( C(i+1UL,j+2UL) );
1511 reset( C(i+1UL,j+3UL) );
1512 }
1513 }
1514
1515 for( ; (j+2UL) <= jend; j+=2UL )
1516 {
1517 const size_t kbegin( ( IsUpper_v<MT4> )
1518 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1519 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1520 const size_t kend( ( IsLower_v<MT4> )
1521 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
1522 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1523
1524 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1525 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1526
1527 size_t k( kbegin );
1528
1529 if( k < kpos )
1530 {
1531 SIMDType a1( A.load(i ,k) );
1532 SIMDType a2( A.load(i+1UL,k) );
1533 SIMDType b1( B.load(k,j ) );
1534 SIMDType b2( B.load(k,j+1UL) );
1535 SIMDType xmm1( a1 * b1 );
1536 SIMDType xmm2( a1 * b2 );
1537 SIMDType xmm3( a2 * b1 );
1538 SIMDType xmm4( a2 * b2 );
1539
1540 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1541 a1 = A.load(i ,k);
1542 a2 = A.load(i+1UL,k);
1543 b1 = B.load(k,j );
1544 b2 = B.load(k,j+1UL);
1545 xmm1 += a1 * b1;
1546 xmm2 += a1 * b2;
1547 xmm3 += a2 * b1;
1548 xmm4 += a2 * b2;
1549 }
1550
1551 C(i ,j ) = sum( xmm1 );
1552 C(i ,j+1UL) = sum( xmm2 );
1553 C(i+1UL,j ) = sum( xmm3 );
1554 C(i+1UL,j+1UL) = sum( xmm4 );
1555
1556 for( ; remainder && k<kend; ++k ) {
1557 C(i ,j ) += A(i ,k) * B(k,j );
1558 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1559 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1560 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1561 }
1562 }
1563 else if( k < kend )
1564 {
1565 ElementType value1( A(i ,k) * B(k,j ) );
1566 ElementType value2( A(i ,k) * B(k,j+1UL) );
1567 ElementType value3( A(i+1UL,k) * B(k,j ) );
1568 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
1569
1570 for( ++k; k<kend; ++k ) {
1571 value1 += A(i ,k) * B(k,j );
1572 value2 += A(i ,k) * B(k,j+1UL);
1573 value3 += A(i+1UL,k) * B(k,j );
1574 value4 += A(i+1UL,k) * B(k,j+1UL);
1575 }
1576
1577 C(i ,j ) = value1;
1578 C(i ,j+1UL) = value2;
1579 C(i+1UL,j ) = value3;
1580 C(i+1UL,j+1UL) = value4;
1581 }
1582 else
1583 {
1584 reset( C(i ,j ) );
1585 reset( C(i ,j+1UL) );
1586 reset( C(i+1UL,j ) );
1587 reset( C(i+1UL,j+1UL) );
1588 }
1589 }
1590
1591 if( j < jend )
1592 {
1593 const size_t kbegin( ( IsUpper_v<MT4> )
1594 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1595 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1596 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1597
1598 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1599 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1600
1601 size_t k( kbegin );
1602
1603 if( k < kpos )
1604 {
1605 SIMDType b1( B.load(k,j) );
1606 SIMDType xmm1( A.load(i ,k) * b1 );
1607 SIMDType xmm2( A.load(i+1UL,k) * b1 );
1608
1609 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1610 b1 = B.load(k,j);
1611 xmm1 += A.load(i ,k) * b1;
1612 xmm2 += A.load(i+1UL,k) * b1;
1613 }
1614
1615 C(i ,j) = sum( xmm1 );
1616 C(i+1UL,j) = sum( xmm2 );
1617
1618 for( ; remainder && k<kend; ++k ) {
1619 C(i ,j) += A(i ,k) * B(k,j);
1620 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1621 }
1622 }
1623 else if( k < kend )
1624 {
1625 ElementType value1( A(i ,k) * B(k,j) );
1626 ElementType value2( A(i+1UL,k) * B(k,j) );
1627
1628 for( ++k; k<kend; ++k ) {
1629 value1 += A(i ,k) * B(k,j);
1630 value2 += A(i+1UL,k) * B(k,j);
1631 }
1632
1633 C(i ,j) = value1;
1634 C(i+1UL,j) = value2;
1635 }
1636 else
1637 {
1638 reset( C(i ,j) );
1639 reset( C(i+1UL,j) );
1640 }
1641
1642 if( LOW ) ++j;
1643 }
1644
1645 if( LOW ) {
1646 for( ; j<N; ++j ) {
1647 reset( C(i ,j) );
1648 reset( C(i+1UL,j) );
1649 }
1650 }
1651 }
1652
1653 for( ; i<M; ++i )
1654 {
1655 const size_t jend( LOW ? i+1UL : N );
1656 size_t j( 0UL );
1657
1658 if( SYM || HERM ) {
1659 for( ; j<i; ++j ) {
1660 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1661 }
1662 }
1663 else if( UPP ) {
1664 for( ; j<i; ++j ) {
1665 reset( C(i,j) );
1666 }
1667 }
1668
1669 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
1670 {
1671 const size_t kbegin( ( IsUpper_v<MT4> )
1672 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1673 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1674 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
1675
1676 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1677 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1678
1679 size_t k( kbegin );
1680
1681 if( k < kpos )
1682 {
1683 SIMDType a1( A.load(i,k) );
1684 SIMDType xmm1( a1 * B.load(k,j ) );
1685 SIMDType xmm2( a1 * B.load(k,j+1UL) );
1686 SIMDType xmm3( a1 * B.load(k,j+2UL) );
1687 SIMDType xmm4( a1 * B.load(k,j+3UL) );
1688
1689 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1690 a1 = A.load(i,k);
1691 xmm1 += a1 * B.load(k,j );
1692 xmm2 += a1 * B.load(k,j+1UL);
1693 xmm3 += a1 * B.load(k,j+2UL);
1694 xmm4 += a1 * B.load(k,j+3UL);
1695 }
1696
1697 C(i,j ) = sum( xmm1 );
1698 C(i,j+1UL) = sum( xmm2 );
1699 C(i,j+2UL) = sum( xmm3 );
1700 C(i,j+3UL) = sum( xmm4 );
1701
1702 for( ; remainder && k<kend; ++k ) {
1703 C(i,j ) += A(i,k) * B(k,j );
1704 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1705 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
1706 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
1707 }
1708 }
1709 else if( k < kend )
1710 {
1711 ElementType value1( A(i,k) * B(k,j ) );
1712 ElementType value2( A(i,k) * B(k,j+1UL) );
1713 ElementType value3( A(i,k) * B(k,j+2UL) );
1714 ElementType value4( A(i,k) * B(k,j+3UL) );
1715
1716 for( ++k; k<kend; ++k ) {
1717 value1 += A(i,k) * B(k,j );
1718 value2 += A(i,k) * B(k,j+1UL);
1719 value3 += A(i,k) * B(k,j+2UL);
1720 value4 += A(i,k) * B(k,j+3UL);
1721 }
1722
1723 C(i,j ) = value1;
1724 C(i,j+1UL) = value2;
1725 C(i,j+2UL) = value3;
1726 C(i,j+3UL) = value4;
1727 }
1728 else
1729 {
1730 reset( C(i,j ) );
1731 reset( C(i,j+1UL) );
1732 reset( C(i,j+2UL) );
1733 reset( C(i,j+3UL) );
1734 }
1735 }
1736
1737 for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
1738 {
1739 const size_t kbegin( ( IsUpper_v<MT4> )
1740 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1741 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1742 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1743
1744 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1745 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1746
1747 size_t k( kbegin );
1748
1749 if( k < kpos )
1750 {
1751 SIMDType a1( A.load(i,k) );
1752 SIMDType xmm1( a1 * B.load(k,j ) );
1753 SIMDType xmm2( a1 * B.load(k,j+1UL) );
1754
1755 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1756 a1 = A.load(i,k);
1757 xmm1 += a1 * B.load(k,j );
1758 xmm2 += a1 * B.load(k,j+1UL);
1759 }
1760
1761 C(i,j ) = sum( xmm1 );
1762 C(i,j+1UL) = sum( xmm2 );
1763
1764 for( ; remainder && k<kend; ++k ) {
1765 C(i,j ) += A(i,k) * B(k,j );
1766 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1767 }
1768 }
1769 else if( k < kend )
1770 {
1771 ElementType value1( A(i,k) * B(k,j ) );
1772 ElementType value2( A(i,k) * B(k,j+1UL) );
1773
1774 for( ++k; k<kend; ++k ) {
1775 value1 += A(i,k) * B(k,j );
1776 value2 += A(i,k) * B(k,j+1UL);
1777 }
1778
1779 C(i,j ) = value1;
1780 C(i,j+1UL) = value2;
1781 }
1782 else
1783 {
1784 reset( C(i,j ) );
1785 reset( C(i,j+1UL) );
1786 }
1787 }
1788
1789 if( j < jend )
1790 {
1791 const size_t kbegin( ( IsUpper_v<MT4> )
1792 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1793 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1794
1795 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
1796 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
1797
1798 size_t k( kbegin );
1799
1800 if( k < kpos )
1801 {
1802 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
1803
1804 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1805 xmm1 += A.load(i,k) * B.load(k,j);
1806 }
1807
1808 C(i,j) = sum( xmm1 );
1809
1810 for( ; remainder && k<K; ++k ) {
1811 C(i,j) += A(i,k) * B(k,j);
1812 }
1813 }
1814 else if( k < K )
1815 {
1816 ElementType value( A(i,k) * B(k,j) );
1817
1818 for( ++k; k<K; ++k ) {
1819 value += A(i,k) * B(k,j);
1820 }
1821
1822 C(i,j) = value;
1823 }
1824 else
1825 {
1826 reset( C(i,j) );
1827 }
1828
1829 if( LOW ) ++j;
1830 }
1831
1832 if( LOW ) {
1833 for( ; j<N; ++j ) {
1834 reset( C(i,j) );
1835 }
1836 }
1837 }
1838 }
1840 //**********************************************************************************************
1841
1842 //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1857 template< typename MT3 // Type of the left-hand side target matrix
1858 , typename MT4 // Type of the left-hand side matrix operand
1859 , typename MT5 > // Type of the right-hand side matrix operand
1860 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1861 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1862 {
1863 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1864
1865 const size_t M( A.rows() );
1866 const size_t N( B.columns() );
1867 const size_t K( A.columns() );
1868
1869 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1870
1871 size_t i( 0UL );
1872
1873 for( ; !( LOW && UPP ) && (i+4UL) <= M; i+=4UL )
1874 {
1875 const size_t jend( LOW ? i+4UL : N );
1876 size_t j( 0UL );
1877
1878 if( SYM || HERM ) {
1879 for( ; j<i; ++j ) {
1880 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
1881 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
1882 C(i+2UL,j) = HERM ? conj( C(j,i+2UL) ) : C(j,i+2UL);
1883 C(i+3UL,j) = HERM ? conj( C(j,i+3UL) ) : C(j,i+3UL);
1884 }
1885 }
1886 else if( UPP ) {
1887 for( ; j<i; ++j ) {
1888 reset( C(i ,j) );
1889 reset( C(i+1UL,j) );
1890 reset( C(i+2UL,j) );
1891 reset( C(i+3UL,j) );
1892 }
1893 }
1894
1895 for( ; (j+2UL) <= jend; j+=2UL )
1896 {
1897 const size_t kbegin( ( IsUpper_v<MT4> )
1898 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
1899 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
1900 const size_t kend( ( IsLower_v<MT4> )
1901 ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
1902 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1903
1904 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
1905 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
1906
1907 size_t k( kbegin );
1908
1909 if( k < kpos )
1910 {
1911 SIMDType a1( A.load(i ,k) );
1912 SIMDType a2( A.load(i+1UL,k) );
1913 SIMDType a3( A.load(i+2UL,k) );
1914 SIMDType a4( A.load(i+3UL,k) );
1915 SIMDType b1( B.load(k,j ) );
1916 SIMDType b2( B.load(k,j+1UL) );
1917 SIMDType xmm1( a1 * b1 );
1918 SIMDType xmm2( a1 * b2 );
1919 SIMDType xmm3( a2 * b1 );
1920 SIMDType xmm4( a2 * b2 );
1921 SIMDType xmm5( a3 * b1 );
1922 SIMDType xmm6( a3 * b2 );
1923 SIMDType xmm7( a4 * b1 );
1924 SIMDType xmm8( a4 * b2 );
1925
1926 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
1927 a1 = A.load(i ,k);
1928 a2 = A.load(i+1UL,k);
1929 a3 = A.load(i+2UL,k);
1930 a4 = A.load(i+3UL,k);
1931 b1 = B.load(k,j );
1932 b2 = B.load(k,j+1UL);
1933 xmm1 += a1 * b1;
1934 xmm2 += a1 * b2;
1935 xmm3 += a2 * b1;
1936 xmm4 += a2 * b2;
1937 xmm5 += a3 * b1;
1938 xmm6 += a3 * b2;
1939 xmm7 += a4 * b1;
1940 xmm8 += a4 * b2;
1941 }
1942
1943 C(i ,j ) = sum( xmm1 );
1944 C(i ,j+1UL) = sum( xmm2 );
1945 C(i+1UL,j ) = sum( xmm3 );
1946 C(i+1UL,j+1UL) = sum( xmm4 );
1947 C(i+2UL,j ) = sum( xmm5 );
1948 C(i+2UL,j+1UL) = sum( xmm6 );
1949 C(i+3UL,j ) = sum( xmm7 );
1950 C(i+3UL,j+1UL) = sum( xmm8 );
1951
1952 for( ; remainder && k<kend; ++k ) {
1953 C(i ,j ) += A(i ,k) * B(k,j );
1954 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1955 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1956 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1957 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1958 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1959 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1960 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
1961 }
1962 }
1963 else if( k < kend )
1964 {
1965 ElementType value1( A(i ,k) * B(k,j ) );
1966 ElementType value2( A(i ,k) * B(k,j+1UL) );
1967 ElementType value3( A(i+1UL,k) * B(k,j ) );
1968 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
1969 ElementType value5( A(i+2UL,k) * B(k,j ) );
1970 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
1971 ElementType value7( A(i+3UL,k) * B(k,j ) );
1972 ElementType value8( A(i+3UL,k) * B(k,j+1UL) );
1973
1974 for( ++k; k<kend; ++k ) {
1975 value1 += A(i ,k) * B(k,j );
1976 value2 += A(i ,k) * B(k,j+1UL);
1977 value3 += A(i+1UL,k) * B(k,j );
1978 value4 += A(i+1UL,k) * B(k,j+1UL);
1979 value5 += A(i+2UL,k) * B(k,j );
1980 value6 += A(i+2UL,k) * B(k,j+1UL);
1981 value7 += A(i+3UL,k) * B(k,j );
1982 value8 += A(i+3UL,k) * B(k,j+1UL);
1983 }
1984
1985 C(i ,j ) = value1;
1986 C(i ,j+1UL) = value2;
1987 C(i+1UL,j ) = value3;
1988 C(i+1UL,j+1UL) = value4;
1989 C(i+2UL,j ) = value5;
1990 C(i+2UL,j+1UL) = value6;
1991 C(i+3UL,j ) = value7;
1992 C(i+3UL,j+1UL) = value8;
1993 }
1994 else
1995 {
1996 reset( C(i ,j ) );
1997 reset( C(i ,j+1UL) );
1998 reset( C(i+1UL,j ) );
1999 reset( C(i+1UL,j+1UL) );
2000 reset( C(i+2UL,j ) );
2001 reset( C(i+2UL,j+1UL) );
2002 reset( C(i+3UL,j ) );
2003 reset( C(i+3UL,j+1UL) );
2004 }
2005 }
2006
2007 if( j < jend )
2008 {
2009 const size_t kbegin( ( IsUpper_v<MT4> )
2010 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
2011 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
2012 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
2013
2014 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
2015 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
2016
2017 size_t k( kbegin );
2018
2019 if( k < kpos )
2020 {
2021 SIMDType b1( B.load(k,j) );
2022 SIMDType xmm1( A.load(i ,k) * b1 );
2023 SIMDType xmm2( A.load(i+1UL,k) * b1 );
2024 SIMDType xmm3( A.load(i+2UL,k) * b1 );
2025 SIMDType xmm4( A.load(i+3UL,k) * b1 );
2026
2027 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
2028 b1 = B.load(k,j);
2029 xmm1 += A.load(i ,k) * b1;
2030 xmm2 += A.load(i+1UL,k) * b1;
2031 xmm3 += A.load(i+2UL,k) * b1;
2032 xmm4 += A.load(i+3UL,k) * b1;
2033 }
2034
2035 C(i ,j) = sum( xmm1 );
2036 C(i+1UL,j) = sum( xmm2 );
2037 C(i+2UL,j) = sum( xmm3 );
2038 C(i+3UL,j) = sum( xmm4 );
2039
2040 for( ; remainder && k<kend; ++k ) {
2041 C(i ,j) += A(i ,k) * B(k,j);
2042 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2043 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
2044 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
2045 }
2046 }
2047 else if( k < kend )
2048 {
2049 ElementType value1( A(i ,k) * B(k,j) );
2050 ElementType value2( A(i+1UL,k) * B(k,j) );
2051 ElementType value3( A(i+2UL,k) * B(k,j) );
2052 ElementType value4( A(i+3UL,k) * B(k,j) );
2053
2054 for( ++k; k<kend; ++k ) {
2055 value1 += A(i ,k) * B(k,j);
2056 value2 += A(i+1UL,k) * B(k,j);
2057 value3 += A(i+2UL,k) * B(k,j);
2058 value4 += A(i+3UL,k) * B(k,j);
2059 }
2060
2061 C(i ,j) = value1;
2062 C(i+1UL,j) = value2;
2063 C(i+2UL,j) = value3;
2064 C(i+3UL,j) = value4;
2065 }
2066 else
2067 {
2068 reset( C(i ,j) );
2069 reset( C(i+1UL,j) );
2070 reset( C(i+2UL,j) );
2071 reset( C(i+3UL,j) );
2072 }
2073
2074 if( LOW ) ++j;
2075 }
2076
2077 if( LOW ) {
2078 for( ; j<N; ++j ) {
2079 reset( C(i ,j) );
2080 reset( C(i+1UL,j) );
2081 reset( C(i+2UL,j) );
2082 reset( C(i+3UL,j) );
2083 }
2084 }
2085 }
2086
2087 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
2088 {
2089 const size_t jend( LOW ? i+3UL : N );
2090 size_t j( 0UL );
2091
2092 if( SYM || HERM ) {
2093 for( ; j<i; ++j ) {
2094 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
2095 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
2096 C(i+2UL,j) = HERM ? conj( C(j,i+2UL) ) : C(j,i+2UL);
2097 }
2098 }
2099 else if( UPP ) {
2100 for( ; j<i; ++j ) {
2101 reset( C(i ,j) );
2102 reset( C(i+1UL,j) );
2103 reset( C(i+2UL,j) );
2104 }
2105 }
2106
2107 for( ; (j+3UL) <= jend; j+=3UL )
2108 {
2109 const size_t kbegin( ( IsUpper_v<MT4> )
2110 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
2111 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
2112 const size_t kend( ( IsLower_v<MT4> )
2113 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
2114 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
2115
2116 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
2117 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
2118
2119 size_t k( kbegin );
2120
2121 if( k < kpos )
2122 {
2123 SIMDType a1( A.load(i ,k) );
2124 SIMDType a2( A.load(i+1UL,k) );
2125 SIMDType a3( A.load(i+2UL,k) );
2126 SIMDType b1( B.load(k,j ) );
2127 SIMDType b2( B.load(k,j+1UL) );
2128 SIMDType b3( B.load(k,j+2UL) );
2129 SIMDType xmm1( a1 * b1 );
2130 SIMDType xmm2( a1 * b2 );
2131 SIMDType xmm3( a1 * b3 );
2132 SIMDType xmm4( a2 * b1 );
2133 SIMDType xmm5( a2 * b2 );
2134 SIMDType xmm6( a2 * b3 );
2135 SIMDType xmm7( a3 * b1 );
2136 SIMDType xmm8( a3 * b2 );
2137 SIMDType xmm9( a3 * b3 );
2138
2139 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
2140 a1 = A.load(i ,k);
2141 a2 = A.load(i+1UL,k);
2142 a3 = A.load(i+2UL,k);
2143 b1 = B.load(k,j );
2144 b2 = B.load(k,j+1UL);
2145 b3 = B.load(k,j+2UL);
2146 xmm1 += a1 * b1;
2147 xmm2 += a1 * b2;
2148 xmm3 += a1 * b3;
2149 xmm4 += a2 * b1;
2150 xmm5 += a2 * b2;
2151 xmm6 += a2 * b3;
2152 xmm7 += a3 * b1;
2153 xmm8 += a3 * b2;
2154 xmm9 += a3 * b3;
2155 }
2156
2157 C(i ,j ) = sum( xmm1 );
2158 C(i ,j+1UL) = sum( xmm2 );
2159 C(i ,j+2UL) = sum( xmm3 );
2160 C(i+1UL,j ) = sum( xmm4 );
2161 C(i+1UL,j+1UL) = sum( xmm5 );
2162 C(i+1UL,j+2UL) = sum( xmm6 );
2163 C(i+2UL,j ) = sum( xmm7 );
2164 C(i+2UL,j+1UL) = sum( xmm8 );
2165 C(i+2UL,j+2UL) = sum( xmm9 );
2166
2167 for( ; remainder && k<kend; ++k ) {
2168 C(i ,j ) += A(i ,k) * B(k,j );
2169 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2170 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2171 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2172 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2173 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2174 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2175 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2176 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
2177 }
2178 }
2179 else if( k < kend )
2180 {
2181 ElementType value1( A(i ,k) * B(k,j ) );
2182 ElementType value2( A(i ,k) * B(k,j+1UL) );
2183 ElementType value3( A(i ,k) * B(k,j+2UL) );
2184 ElementType value4( A(i+1UL,k) * B(k,j ) );
2185 ElementType value5( A(i+1UL,k) * B(k,j+1UL) );
2186 ElementType value6( A(i+1UL,k) * B(k,j+2UL) );
2187 ElementType value7( A(i+2UL,k) * B(k,j ) );
2188 ElementType value8( A(i+2UL,k) * B(k,j+1UL) );
2189 ElementType value9( A(i+2UL,k) * B(k,j+2UL) );
2190
2191 for( ++k; k<kend; ++k ) {
2192 value1 += A(i ,k) * B(k,j );
2193 value2 += A(i ,k) * B(k,j+1UL);
2194 value3 += A(i ,k) * B(k,j+2UL);
2195 value4 += A(i+1UL,k) * B(k,j );
2196 value5 += A(i+1UL,k) * B(k,j+1UL);
2197 value6 += A(i+1UL,k) * B(k,j+2UL);
2198 value7 += A(i+2UL,k) * B(k,j );
2199 value8 += A(i+2UL,k) * B(k,j+1UL);
2200 value9 += A(i+2UL,k) * B(k,j+2UL);
2201 }
2202
2203 C(i ,j ) = value1;
2204 C(i ,j+1UL) = value2;
2205 C(i ,j+2UL) = value3;
2206 C(i+1UL,j ) = value4;
2207 C(i+1UL,j+1UL) = value5;
2208 C(i+1UL,j+2UL) = value6;
2209 C(i+2UL,j ) = value7;
2210 C(i+2UL,j+1UL) = value8;
2211 C(i+2UL,j+2UL) = value9;
2212 }
2213 else
2214 {
2215 reset( C(i ,j ) );
2216 reset( C(i ,j+1UL) );
2217 reset( C(i ,j+2UL) );
2218 reset( C(i+1UL,j ) );
2219 reset( C(i+1UL,j+1UL) );
2220 reset( C(i+1UL,j+2UL) );
2221 reset( C(i+2UL,j ) );
2222 reset( C(i+2UL,j+1UL) );
2223 reset( C(i+2UL,j+2UL) );
2224 }
2225 }
2226
2227 for( ; (j+2UL) <= jend; j+=2UL )
2228 {
2229 const size_t kbegin( ( IsUpper_v<MT4> )
2230 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
2231 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
2232 const size_t kend( ( IsLower_v<MT4> )
2233 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
2234 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2235
2236 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
2237 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
2238
2239 size_t k( kbegin );
2240
2241 if( k < kpos )
2242 {
2243 SIMDType a1( A.load(i ,k) );
2244 SIMDType a2( A.load(i+1UL,k) );
2245 SIMDType a3( A.load(i+2UL,k) );
2246 SIMDType b1( B.load(k,j ) );
2247 SIMDType b2( B.load(k,j+1UL) );
2248 SIMDType xmm1( a1 * b1 );
2249 SIMDType xmm2( a1 * b2 );
2250 SIMDType xmm3( a2 * b1 );
2251 SIMDType xmm4( a2 * b2 );
2252 SIMDType xmm5( a3 * b1 );
2253 SIMDType xmm6( a3 * b2 );
2254
2255 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
2256 a1 = A.load(i ,k);
2257 a2 = A.load(i+1UL,k);
2258 a3 = A.load(i+2UL,k);
2259 b1 = B.load(k,j );
2260 b2 = B.load(k,j+1UL);
2261 xmm1 += a1 * b1;
2262 xmm2 += a1 * b2;
2263 xmm3 += a2 * b1;
2264 xmm4 += a2 * b2;
2265 xmm5 += a3 * b1;
2266 xmm6 += a3 * b2;
2267 }
2268
2269 C(i ,j ) = sum( xmm1 );
2270 C(i ,j+1UL) = sum( xmm2 );
2271 C(i+1UL,j ) = sum( xmm3 );
2272 C(i+1UL,j+1UL) = sum( xmm4 );
2273 C(i+2UL,j ) = sum( xmm5 );
2274 C(i+2UL,j+1UL) = sum( xmm6 );
2275
2276 for( ; remainder && k<kend; ++k ) {
2277 C(i ,j ) += A(i ,k) * B(k,j );
2278 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2279 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2280 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2281 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2282 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2283 }
2284 }
2285 else if( k < kend )
2286 {
2287 ElementType value1( A(i ,k) * B(k,j ) );
2288 ElementType value2( A(i ,k) * B(k,j+1UL) );
2289 ElementType value3( A(i+1UL,k) * B(k,j ) );
2290 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
2291 ElementType value5( A(i+2UL,k) * B(k,j ) );
2292 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
2293
2294 for( ++k; k<kend; ++k ) {
2295 value1 += A(i ,k) * B(k,j );
2296 value2 += A(i ,k) * B(k,j+1UL);
2297 value3 += A(i+1UL,k) * B(k,j );
2298 value4 += A(i+1UL,k) * B(k,j+1UL);
2299 value5 += A(i+2UL,k) * B(k,j );
2300 value6 += A(i+2UL,k) * B(k,j+1UL);
2301 }
2302
2303 C(i ,j ) = value1;
2304 C(i ,j+1UL) = value2;
2305 C(i+1UL,j ) = value3;
2306 C(i+1UL,j+1UL) = value4;
2307 C(i+2UL,j ) = value5;
2308 C(i+2UL,j+1UL) = value6;
2309 }
2310 else
2311 {
2312 reset( C(i ,j ) );
2313 reset( C(i ,j+1UL) );
2314 reset( C(i+1UL,j ) );
2315 reset( C(i+1UL,j+1UL) );
2316 reset( C(i+2UL,j ) );
2317 reset( C(i+2UL,j+1UL) );
2318 }
2319 }
2320
2321 if( j < jend )
2322 {
2323 const size_t kbegin( ( IsUpper_v<MT4> )
2324 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
2325 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
2326 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
2327
2328 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
2329 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
2330
2331 size_t k( kbegin );
2332
2333 if( k < kpos )
2334 {
2335 SIMDType b1( B.load(k,j) );
2336 SIMDType xmm1( A.load(i ,k) * b1 );
2337 SIMDType xmm2( A.load(i+1UL,k) * b1 );
2338 SIMDType xmm3( A.load(i+2UL,k) * b1 );
2339
2340 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
2341 b1 = B.load(k,j);
2342 xmm1 += A.load(i ,k) * b1;
2343 xmm2 += A.load(i+1UL,k) * b1;
2344 xmm3 += A.load(i+2UL,k) * b1;
2345 }
2346
2347 C(i ,j) = sum( xmm1 );
2348 C(i+1UL,j) = sum( xmm2 );
2349 C(i+2UL,j) = sum( xmm3 );
2350
2351 for( ; remainder && k<kend; ++k ) {
2352 C(i ,j) += A(i ,k) * B(k,j);
2353 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2354 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
2355 }
2356 }
2357 else if( k < kend )
2358 {
2359 ElementType value1( A(i ,k) * B(k,j) );
2360 ElementType value2( A(i+1UL,k) * B(k,j) );
2361 ElementType value3( A(i+2UL,k) * B(k,j) );
2362
2363 for( ++k; k<kend; ++k ) {
2364 value1 += A(i ,k) * B(k,j);
2365 value2 += A(i+1UL,k) * B(k,j);
2366 value3 += A(i+2UL,k) * B(k,j);
2367 }
2368
2369 C(i ,j) = value1;
2370 C(i+1UL,j) = value2;
2371 C(i+2UL,j) = value3;
2372 }
2373 else
2374 {
2375 reset( C(i ,j) );
2376 reset( C(i+1UL,j) );
2377 reset( C(i+2UL,j) );
2378 }
2379
2380 if( LOW ) ++j;
2381 }
2382
2383 if( LOW ) {
2384 for( ; j<N; ++j ) {
2385 reset( C(i ,j) );
2386 reset( C(i+1UL,j) );
2387 reset( C(i+2UL,j) );
2388 }
2389 }
2390 }
2391
2392 for( ; (i+2UL) <= M; i+=2UL )
2393 {
2394 const size_t jend( LOW ? i+2UL : N );
2395 size_t j( 0UL );
2396
2397 if( SYM || HERM ) {
2398 for( ; j<i; ++j ) {
2399 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
2400 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
2401 }
2402 }
2403 else if( UPP ) {
2404 for( ; j<i; ++j ) {
2405 reset( C(i ,j) );
2406 reset( C(i+1UL,j) );
2407 }
2408 }
2409
2410 for( ; (j+2UL) <= jend; j+=2UL )
2411 {
2412 const size_t kbegin( ( IsUpper_v<MT4> )
2413 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
2414 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
2415 const size_t kend( ( IsLower_v<MT4> )
2416 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
2417 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2418
2419 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
2420 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
2421
2422 size_t k( kbegin );
2423
2424 if( k < kpos )
2425 {
2426 SIMDType a1( A.load(i ,k) );
2427 SIMDType a2( A.load(i+1UL,k) );
2428 SIMDType b1( B.load(k,j ) );
2429 SIMDType b2( B.load(k,j+1UL) );
2430 SIMDType xmm1( a1 * b1 );
2431 SIMDType xmm2( a1 * b2 );
2432 SIMDType xmm3( a2 * b1 );
2433 SIMDType xmm4( a2 * b2 );
2434
2435 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
2436 a1 = A.load(i ,k);
2437 a2 = A.load(i+1UL,k);
2438 b1 = B.load(k,j );
2439 b2 = B.load(k,j+1UL);
2440 xmm1 += a1 * b1;
2441 xmm2 += a1 * b2;
2442 xmm3 += a2 * b1;
2443 xmm4 += a2 * b2;
2444 }
2445
2446 C(i ,j ) = sum( xmm1 );
2447 C(i ,j+1UL) = sum( xmm2 );
2448 C(i+1UL,j ) = sum( xmm3 );
2449 C(i+1UL,j+1UL) = sum( xmm4 );
2450
2451 for( ; remainder && k<kend; ++k ) {
2452 C(i ,j ) += A(i ,k) * B(k,j );
2453 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2454 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2455 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2456 }
2457 }
2458 else if( k < kend )
2459 {
2460 ElementType value1( A(i ,k) * B(k,j ) );
2461 ElementType value2( A(i ,k) * B(k,j+1UL) );
2462 ElementType value3( A(i+1UL,k) * B(k,j ) );
2463 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
2464
2465 for( ++k; k<kend; ++k ) {
2466 value1 += A(i ,k) * B(k,j );
2467 value2 += A(i ,k) * B(k,j+1UL);
2468 value3 += A(i+1UL,k) * B(k,j );
2469 value4 += A(i+1UL,k) * B(k,j+1UL);
2470 }
2471
2472 C(i ,j ) = value1;
2473 C(i ,j+1UL) = value2;
2474 C(i+1UL,j ) = value3;
2475 C(i+1UL,j+1UL) = value4;
2476 }
2477 else
2478 {
2479 reset( C(i ,j ) );
2480 reset( C(i ,j+1UL) );
2481 reset( C(i+1UL,j ) );
2482 reset( C(i+1UL,j+1UL) );
2483 }
2484 }
2485
2486 if( j < jend )
2487 {
2488 const size_t kbegin( ( IsUpper_v<MT4> )
2489 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
2490 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
2491 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2492
2493 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
2494 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
2495
2496 size_t k( kbegin );
2497
2498 if( k < kpos )
2499 {
2500 SIMDType b1( B.load(k,j) );
2501 SIMDType xmm1( A.load(i ,k) * b1 );
2502 SIMDType xmm2( A.load(i+1UL,k) * b1 );
2503
2504 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
2505 b1 = B.load(k,j);
2506 xmm1 += A.load(i ,k) * b1;
2507 xmm2 += A.load(i+1UL,k) * b1;
2508 }
2509
2510 C(i ,j) = sum( xmm1 );
2511 C(i+1UL,j) = sum( xmm2 );
2512
2513 for( ; remainder && k<kend; ++k ) {
2514 C(i ,j) += A(i ,k) * B(k,j);
2515 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2516 }
2517 }
2518 else if( k < kend )
2519 {
2520 ElementType value1( A(i ,k) * B(k,j) );
2521 ElementType value2( A(i+1UL,k) * B(k,j) );
2522
2523 for( ++k; k<kend; ++k ) {
2524 value1 += A(i ,k) * B(k,j);
2525 value2 += A(i+1UL,k) * B(k,j);
2526 }
2527
2528 C(i ,j) = value1;
2529 C(i+1UL,j) = value2;
2530 }
2531 else
2532 {
2533 reset( C(i ,j) );
2534 reset( C(i+1UL,j) );
2535 }
2536
2537 if( LOW ) ++j;
2538 }
2539
2540 if( LOW ) {
2541 for( ; j<N; ++j ) {
2542 reset( C(i ,j) );
2543 reset( C(i+1UL,j) );
2544 }
2545 }
2546 }
2547
2548 for( ; i<M; ++i )
2549 {
2550 const size_t jend( LOW ? i+1UL : N );
2551 size_t j( 0UL );
2552
2553 if( SYM || HERM ) {
2554 for( ; j<i; ++j ) {
2555 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
2556 }
2557 }
2558 else if( UPP ) {
2559 for( ; j<i; ++j ) {
2560 reset( C(i,j) );
2561 }
2562 }
2563
2564 for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
2565 {
2566 const size_t kbegin( ( IsUpper_v<MT4> )
2567 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
2568 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
2569 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2570
2571 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
2572 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
2573
2574 size_t k( kbegin );
2575
2576 if( k < kpos )
2577 {
2578 SIMDType a1( A.load(i,k) );
2579 SIMDType xmm1( a1 * B.load(k,j ) );
2580 SIMDType xmm2( a1 * B.load(k,j+1UL) );
2581
2582 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
2583 a1 = A.load(i,k);
2584 xmm1 += a1 * B.load(k,j );
2585 xmm2 += a1 * B.load(k,j+1UL);
2586 }
2587
2588 C(i,j ) = sum( xmm1 );
2589 C(i,j+1UL) = sum( xmm2 );
2590
2591 for( ; remainder && k<kend; ++k ) {
2592 C(i,j ) += A(i,k) * B(k,j );
2593 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2594 }
2595 }
2596 else if( k < kend )
2597 {
2598 ElementType value1( A(i,k) * B(k,j ) );
2599 ElementType value2( A(i,k) * B(k,j+1UL) );
2600
2601 for( ++k; k<kend; ++k ) {
2602 value1 += A(i,k) * B(k,j );
2603 value2 += A(i,k) * B(k,j+1UL);
2604 }
2605
2606 C(i,j ) = value1;
2607 C(i,j+1UL) = value2;
2608 }
2609 else
2610 {
2611 reset( C(i,j ) );
2612 reset( C(i,j+1UL) );
2613 }
2614 }
2615
2616 if( j < jend )
2617 {
2618 const size_t kbegin( ( IsUpper_v<MT4> )
2619 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
2620 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
2621
2622 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
2623 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
2624
2625 size_t k( kbegin );
2626
2627 if( k < kpos )
2628 {
2629 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
2630
2631 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
2632 xmm1 += A.load(i,k) * B.load(k,j);
2633 }
2634
2635 C(i,j) = sum( xmm1 );
2636
2637 for( ; remainder && k<K; ++k ) {
2638 C(i,j) += A(i,k) * B(k,j);
2639 }
2640 }
2641 else if( k < K )
2642 {
2643 ElementType value( A(i,k) * B(k,j) );
2644
2645 for( ++k; k<K; ++k ) {
2646 value += A(i,k) * B(k,j);
2647 }
2648
2649 C(i,j) = value;
2650 }
2651 else
2652 {
2653 reset( C(i,j) );
2654 }
2655
2656 if( LOW ) ++j;
2657 }
2658
2659 if( LOW ) {
2660 for( ; j<N; ++j ) {
2661 reset( C(i,j) );
2662 }
2663 }
2664 }
2665 }
2667 //**********************************************************************************************
2668
2669 //**Default assignment to dense matrices (large matrices)***************************************
2683 template< typename MT3 // Type of the left-hand side target matrix
2684 , typename MT4 // Type of the left-hand side matrix operand
2685 , typename MT5 > // Type of the right-hand side matrix operand
2686 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2687 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2688 {
2689 selectDefaultAssignKernel( C, A, B );
2690 }
2692 //**********************************************************************************************
2693
2694 //**Vectorized default assignment to dense matrices (large matrices)****************************
2709 template< typename MT3 // Type of the left-hand side target matrix
2710 , typename MT4 // Type of the left-hand side matrix operand
2711 , typename MT5 > // Type of the right-hand side matrix operand
2712 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2713 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2714 {
2715 if( SYM )
2716 smmm( C, A, B, ElementType(1) );
2717 else if( HERM )
2718 hmmm( C, A, B, ElementType(1) );
2719 else if( LOW )
2720 lmmm( C, A, B, ElementType(1), ElementType(0) );
2721 else if( UPP )
2722 ummm( C, A, B, ElementType(1), ElementType(0) );
2723 else
2724 mmm( C, A, B, ElementType(1), ElementType(0) );
2725 }
2727 //**********************************************************************************************
2728
2729 //**BLAS-based assignment to dense matrices (default)*******************************************
2743 template< typename MT3 // Type of the left-hand side target matrix
2744 , typename MT4 // Type of the left-hand side matrix operand
2745 , typename MT5 > // Type of the right-hand side matrix operand
2746 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2747 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2748 {
2749 selectLargeAssignKernel( C, A, B );
2750 }
2752 //**********************************************************************************************
2753
2754 //**BLAS-based assignment to dense matrices*****************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
/*!\brief BLAS-based assignment kernel.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is only compiled if both \c BLAZE_BLAS_MODE and
// \c BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero and participates in overload
// resolution only if \c UseBlasKernel_v<MT3,MT4,MT5> is \c true. If \a A is triangular, \a B is
// assigned to \a C and \c trmm() is invoked with \a A on the left side; otherwise, if \a B is
// triangular, \a A is assigned to \a C and \c trmm() is invoked with \a B on the right side.
// If neither operand is triangular, \c gemm() is called with the scalars 1 and 0.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
{
   using Scalar = ElementType_t<MT3>;

   // Triangular left-hand side operand: C = B, then in-place trmm with A from the left
   if( IsTriangular_v<MT4> ) {
      const auto uplo( IsLower_v<MT4> ? CblasLower : CblasUpper );
      assign( C, B );
      trmm( C, A, CblasLeft, uplo, Scalar(1) );
      return;
   }

   // Triangular right-hand side operand: C = A, then in-place trmm with B from the right
   if( IsTriangular_v<MT5> ) {
      const auto uplo( IsLower_v<MT5> ? CblasLower : CblasUpper );
      assign( C, A );
      trmm( C, B, CblasRight, uplo, Scalar(1) );
      return;
   }

   // General operands
   gemm( C, A, B, Scalar(1), Scalar(0) );
}
#endif
2791 //**********************************************************************************************
2792
2793 //**Assignment to sparse matrices***************************************************************
2806 template< typename MT // Type of the target sparse matrix
2807 , bool SO > // Storage order of the target sparse matrix
2808 friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
2809 {
2811
2812 using TmpType = If_t< SO, OppositeType, ResultType >;
2813
2820
2821 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
2822 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
2823
2824 const ForwardFunctor fwd;
2825
2826 const TmpType tmp( serial( rhs ) );
2827 assign( *lhs, fwd( tmp ) );
2828 }
2830 //**********************************************************************************************
2831
2832 //**Addition assignment to dense matrices*******************************************************
2845 template< typename MT // Type of the target dense matrix
2846 , bool SO > // Storage order of the target dense matrix
2847 friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
2848 {
2850
2851 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
2852 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
2853
2854 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2855 return;
2856 }
2857
2858 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2859 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2860
2861 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2862 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2863 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2864 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2865 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
2866 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
2867
2868 DMatTDMatMultExpr::selectAddAssignKernel( *lhs, A, B );
2869 }
2871 //**********************************************************************************************
2872
2873 //**Addition assignment to dense matrices (kernel selection)************************************
2884 template< typename MT3 // Type of the left-hand side target matrix
2885 , typename MT4 // Type of the left-hand side matrix operand
2886 , typename MT5 > // Type of the right-hand side matrix operand
2887 static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2888 {
2889 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
2890 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2891 selectSmallAddAssignKernel( C, A, B );
2892 else
2893 selectBlasAddAssignKernel( C, A, B );
2894 }
2896 //**********************************************************************************************
2897
2898 //**Default addition assignment to row-major dense matrices (general/general)*******************
2912 template< typename MT3 // Type of the left-hand side target matrix
2913 , typename MT4 // Type of the left-hand side matrix operand
2914 , typename MT5 > // Type of the right-hand side matrix operand
2915 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2916 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2917 {
2918 const size_t M( A.rows() );
2919 const size_t N( B.columns() );
2920 const size_t K( A.columns() );
2921
2922 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2923
2924 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
2925 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
2926 :( 0UL ) );
2927 const size_t iend( ( IsStrictlyUpper_v<MT4> )
2928 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
2929 :( M ) );
2930 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2931
2932 for( size_t i=ibegin; i<iend; ++i )
2933 {
2934 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
2935 ?( ( IsStrictlyUpper_v<MT4> )
2936 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
2937 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
2938 :( ( IsStrictlyUpper_v<MT5> )
2939 ?( UPP ? max( i, 1UL ) : 1UL )
2940 :( UPP ? i : 0UL ) ) );
2941 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
2942 ?( ( IsStrictlyLower_v<MT4> )
2943 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
2944 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
2945 :( ( IsStrictlyLower_v<MT5> )
2946 ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
2947 :( LOW ? i+1UL : N ) ) );
2948
2949 if( ( LOW || UPP ) && ( jbegin > jend ) ) continue;
2950 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2951
2952 for( size_t j=jbegin; j<jend; ++j )
2953 {
2954 const size_t kbegin( ( IsUpper_v<MT4> )
2955 ?( ( IsLower_v<MT5> )
2956 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2957 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2958 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2959 :( ( IsLower_v<MT5> )
2960 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2961 :( 0UL ) ) );
2962 const size_t kend( ( IsLower_v<MT4> )
2963 ?( ( IsUpper_v<MT5> )
2964 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
2965 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2966 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2967 :( ( IsUpper_v<MT5> )
2968 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2969 :( K ) ) );
2970 BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
2971
2972 const size_t knum( kend - kbegin );
2973 const size_t kpos( kbegin + prevMultiple( knum, 2UL ) );
2974 BLAZE_INTERNAL_ASSERT( kpos <= kbegin+knum, "Invalid end calculation" );
2975
2976 for( size_t k=kbegin; k<kpos; k+=2UL ) {
2977 C(i,j) += A(i,k ) * B(k ,j);
2978 C(i,j) += A(i,k+1UL) * B(k+1UL,j);
2979 }
2980 if( kpos < kend ) {
2981 C(i,j) += A(i,kpos) * B(kpos,j);
2982 }
2983 }
2984 }
2985 }
2987 //**********************************************************************************************
2988
2989 //**Default addition assignment to column-major dense matrices (general/general)****************
3003 template< typename MT3 // Type of the left-hand side target matrix
3004 , typename MT4 // Type of the left-hand side matrix operand
3005 , typename MT5 > // Type of the right-hand side matrix operand
3006 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3007 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3008 {
3009 const size_t M( A.rows() );
3010 const size_t N( B.columns() );
3011 const size_t K( A.columns() );
3012
3013 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3014
3015 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
3016 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
3017 :( 0UL ) );
3018 const size_t jend( ( IsStrictlyLower_v<MT5> )
3019 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
3020 :( N ) );
3021 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3022
3023 for( size_t j=jbegin; j<jend; ++j )
3024 {
3025 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
3026 ?( ( IsStrictlyLower_v<MT4> )
3027 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
3028 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3029 :( ( IsStrictlyLower_v<MT4> )
3030 ?( LOW ? max( j, 1UL ) : 1UL )
3031 :( LOW ? j : 0UL ) ) );
3032 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3033 ?( ( IsStrictlyUpper_v<MT4> )
3034 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
3035 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
3036 :( ( IsStrictlyUpper_v<MT4> )
3037 ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
3038 :( UPP ? j+1UL : M ) ) );
3039
3040 if( ( LOW || UPP ) && ( ibegin > iend ) ) continue;
3041 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3042
3043 for( size_t i=ibegin; i<iend; ++i )
3044 {
3045 const size_t kbegin( ( IsUpper_v<MT4> )
3046 ?( ( IsLower_v<MT5> )
3047 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3048 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3049 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3050 :( ( IsLower_v<MT5> )
3051 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3052 :( 0UL ) ) );
3053 const size_t kend( ( IsLower_v<MT4> )
3054 ?( ( IsUpper_v<MT5> )
3055 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3056 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3057 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3058 :( ( IsUpper_v<MT5> )
3059 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3060 :( K ) ) );
3061 BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
3062
3063 const size_t knum( kend - kbegin );
3064 const size_t kpos( kbegin + prevMultiple( knum, 2UL ) );
3065 BLAZE_INTERNAL_ASSERT( kpos <= kbegin+knum, "Invalid end calculation" );
3066
3067 for( size_t k=kbegin; k<kpos; k+=2UL ) {
3068 C(i,j) += A(i,k ) * B(k ,j);
3069 C(i,j) += A(i,k+1UL) * B(k+1UL,j);
3070 }
3071 if( kpos < kend ) {
3072 C(i,j) += A(i,kpos) * B(kpos,j);
3073 }
3074 }
3075 }
3076 }
3078 //**********************************************************************************************
3079
3080 //**Default addition assignment to row-major dense matrices (general/diagonal)******************
3094 template< typename MT3 // Type of the left-hand side target matrix
3095 , typename MT4 // Type of the left-hand side matrix operand
3096 , typename MT5 > // Type of the right-hand side matrix operand
3097 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3098 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3099 {
3100 const size_t M( A.rows() );
3101 const size_t N( B.columns() );
3102
3103 for( size_t i=0UL; i<M; ++i )
3104 {
3105 const size_t jbegin( ( IsUpper_v<MT4> )
3106 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3107 :( 0UL ) );
3108 const size_t jend( ( IsLower_v<MT4> )
3109 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3110 :( N ) );
3111 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3112
3113 const size_t jnum( jend - jbegin );
3114 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
3115 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
3116
3117 for( size_t j=jbegin; j<jpos; j+=2UL ) {
3118 C(i,j ) += A(i,j ) * B(j ,j );
3119 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
3120 }
3121 if( jpos < jend ) {
3122 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
3123 }
3124 }
3125 }
3127 //**********************************************************************************************
3128
3129 //**Default addition assignment to column-major dense matrices (general/diagonal)***************
3143 template< typename MT3 // Type of the left-hand side target matrix
3144 , typename MT4 // Type of the left-hand side matrix operand
3145 , typename MT5 > // Type of the right-hand side matrix operand
3146 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3147 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3148 {
3149 constexpr size_t block( BLOCK_SIZE );
3150
3151 const size_t M( A.rows() );
3152 const size_t N( B.columns() );
3153
3154 for( size_t jj=0UL; jj<N; jj+=block ) {
3155 const size_t jend( min( N, jj+block ) );
3156 for( size_t ii=0UL; ii<M; ii+=block ) {
3157 const size_t iend( min( M, ii+block ) );
3158 for( size_t j=jj; j<jend; ++j )
3159 {
3160 const size_t ibegin( ( IsLower_v<MT4> )
3161 ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
3162 :( ii ) );
3163 const size_t ipos( ( IsUpper_v<MT4> )
3164 ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
3165 :( iend ) );
3166
3167 for( size_t i=ibegin; i<ipos; ++i ) {
3168 C(i,j) += A(i,j) * B(j,j);
3169 }
3170 }
3171 }
3172 }
3173 }
3175 //**********************************************************************************************
3176
3177 //**Default addition assignment to row-major dense matrices (diagonal/general)******************
3191 template< typename MT3 // Type of the left-hand side target matrix
3192 , typename MT4 // Type of the left-hand side matrix operand
3193 , typename MT5 > // Type of the right-hand side matrix operand
3194 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3195 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3196 {
3197 constexpr size_t block( BLOCK_SIZE );
3198
3199 const size_t M( A.rows() );
3200 const size_t N( B.columns() );
3201
3202 for( size_t ii=0UL; ii<M; ii+=block ) {
3203 const size_t iend( min( M, ii+block ) );
3204 for( size_t jj=0UL; jj<N; jj+=block ) {
3205 const size_t jend( min( N, jj+block ) );
3206 for( size_t i=ii; i<iend; ++i )
3207 {
3208 const size_t jbegin( ( IsUpper_v<MT5> )
3209 ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
3210 :( jj ) );
3211 const size_t jpos( ( IsLower_v<MT5> )
3212 ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
3213 :( jend ) );
3214
3215 for( size_t j=jbegin; j<jpos; ++j ) {
3216 C(i,j) += A(i,i) * B(i,j);
3217 }
3218 }
3219 }
3220 }
3221 }
3223 //**********************************************************************************************
3224
3225 //**Default addition assignment to column-major dense matrices (diagonal/general)***************
3239 template< typename MT3 // Type of the left-hand side target matrix
3240 , typename MT4 // Type of the left-hand side matrix operand
3241 , typename MT5 > // Type of the right-hand side matrix operand
3242 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3243 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3244 {
3245 const size_t M( A.rows() );
3246 const size_t N( B.columns() );
3247
3248 for( size_t j=0UL; j<N; ++j )
3249 {
3250 const size_t ibegin( ( IsLower_v<MT5> )
3251 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3252 :( 0UL ) );
3253 const size_t iend( ( IsUpper_v<MT5> )
3254 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3255 :( M ) );
3256 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3257
3258 const size_t inum( iend - ibegin );
3259 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
3260 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
3261
3262 for( size_t i=ibegin; i<ipos; i+=2UL ) {
3263 C(i ,j) += A(i ,i ) * B(i ,j);
3264 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
3265 }
3266 if( ipos < iend ) {
3267 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
3268 }
3269 }
3270 }
3272 //**********************************************************************************************
3273
3274 //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
3288 template< typename MT3 // Type of the left-hand side target matrix
3289 , typename MT4 // Type of the left-hand side matrix operand
3290 , typename MT5 > // Type of the right-hand side matrix operand
3291 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3292 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3293 {
3294 for( size_t i=0UL; i<A.rows(); ++i ) {
3295 C(i,i) += A(i,i) * B(i,i);
3296 }
3297 }
3299 //**********************************************************************************************
3300
3301 //**Default addition assignment to dense matrices (small matrices)******************************
3315 template< typename MT3 // Type of the left-hand side target matrix
3316 , typename MT4 // Type of the left-hand side matrix operand
3317 , typename MT5 > // Type of the right-hand side matrix operand
3318 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3319 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3320 {
3321 selectDefaultAddAssignKernel( C, A, B );
3322 }
3324 //**********************************************************************************************
3325
3326 //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
3341 template< typename MT3 // Type of the left-hand side target matrix
3342 , typename MT4 // Type of the left-hand side matrix operand
3343 , typename MT5 > // Type of the right-hand side matrix operand
3344 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3345 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3346 {
3347 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3348
3349 const size_t M( A.rows() );
3350 const size_t N( B.columns() );
3351 const size_t K( A.columns() );
3352
3353 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3354
3355 size_t i( 0UL );
3356
3357 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
3358 {
3359 const size_t jend( LOW ? i+3UL : N );
3360 size_t j( UPP ? i : 0UL );
3361
3362 for( ; (j+3UL) <= jend; j+=3UL )
3363 {
3364 const size_t kbegin( ( IsUpper_v<MT4> )
3365 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3366 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3367 const size_t kend( ( IsLower_v<MT4> )
3368 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
3369 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
3370
3371 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3372 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3373
3374 size_t k( kbegin );
3375
3376 if( k < kpos )
3377 {
3378 SIMDType a1( A.load(i ,k) );
3379 SIMDType a2( A.load(i+1UL,k) );
3380 SIMDType a3( A.load(i+2UL,k) );
3381 SIMDType b1( B.load(k,j ) );
3382 SIMDType b2( B.load(k,j+1UL) );
3383 SIMDType b3( B.load(k,j+2UL) );
3384 SIMDType xmm1( a1 * b1 );
3385 SIMDType xmm2( a1 * b2 );
3386 SIMDType xmm3( a1 * b3 );
3387 SIMDType xmm4( a2 * b1 );
3388 SIMDType xmm5( a2 * b2 );
3389 SIMDType xmm6( a2 * b3 );
3390 SIMDType xmm7( a3 * b1 );
3391 SIMDType xmm8( a3 * b2 );
3392 SIMDType xmm9( a3 * b3 );
3393
3394 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3395 a1 = A.load(i ,k);
3396 a2 = A.load(i+1UL,k);
3397 a3 = A.load(i+2UL,k);
3398 b1 = B.load(k,j );
3399 b2 = B.load(k,j+1UL);
3400 b3 = B.load(k,j+2UL);
3401 xmm1 += a1 * b1;
3402 xmm2 += a1 * b2;
3403 xmm3 += a1 * b3;
3404 xmm4 += a2 * b1;
3405 xmm5 += a2 * b2;
3406 xmm6 += a2 * b3;
3407 xmm7 += a3 * b1;
3408 xmm8 += a3 * b2;
3409 xmm9 += a3 * b3;
3410 }
3411
3412 C(i ,j ) += sum( xmm1 );
3413 C(i ,j+1UL) += sum( xmm2 );
3414 C(i ,j+2UL) += sum( xmm3 );
3415 C(i+1UL,j ) += sum( xmm4 );
3416 C(i+1UL,j+1UL) += sum( xmm5 );
3417 C(i+1UL,j+2UL) += sum( xmm6 );
3418 C(i+2UL,j ) += sum( xmm7 );
3419 C(i+2UL,j+1UL) += sum( xmm8 );
3420 C(i+2UL,j+2UL) += sum( xmm9 );
3421
3422 for( ; remainder && k<kend; ++k ) {
3423 C(i ,j ) += A(i ,k) * B(k,j );
3424 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3425 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
3426 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3427 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3428 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
3429 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3430 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
3431 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
3432 }
3433 }
3434 else
3435 {
3436 for( ; k<kend; ++k ) {
3437 C(i ,j ) += A(i ,k) * B(k,j );
3438 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3439 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
3440 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3441 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3442 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
3443 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3444 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
3445 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
3446 }
3447 }
3448 }
3449
3450 for( ; (j+2UL) <= jend; j+=2UL )
3451 {
3452 const size_t kbegin( ( IsUpper_v<MT4> )
3453 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3454 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3455 const size_t kend( ( IsLower_v<MT4> )
3456 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
3457 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3458
3459 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3460 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3461
3462 size_t k( kbegin );
3463
3464 if( k < kpos )
3465 {
3466 SIMDType a1( A.load(i ,k) );
3467 SIMDType a2( A.load(i+1UL,k) );
3468 SIMDType a3( A.load(i+2UL,k) );
3469 SIMDType b1( B.load(k,j ) );
3470 SIMDType b2( B.load(k,j+1UL) );
3471 SIMDType xmm1( a1 * b1 );
3472 SIMDType xmm2( a1 * b2 );
3473 SIMDType xmm3( a2 * b1 );
3474 SIMDType xmm4( a2 * b2 );
3475 SIMDType xmm5( a3 * b1 );
3476 SIMDType xmm6( a3 * b2 );
3477
3478 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3479 a1 = A.load(i ,k);
3480 a2 = A.load(i+1UL,k);
3481 a3 = A.load(i+2UL,k);
3482 b1 = B.load(k,j );
3483 b2 = B.load(k,j+1UL);
3484 xmm1 += a1 * b1;
3485 xmm2 += a1 * b2;
3486 xmm3 += a2 * b1;
3487 xmm4 += a2 * b2;
3488 xmm5 += a3 * b1;
3489 xmm6 += a3 * b2;
3490 }
3491
3492 C(i ,j ) += sum( xmm1 );
3493 C(i ,j+1UL) += sum( xmm2 );
3494 C(i+1UL,j ) += sum( xmm3 );
3495 C(i+1UL,j+1UL) += sum( xmm4 );
3496 C(i+2UL,j ) += sum( xmm5 );
3497 C(i+2UL,j+1UL) += sum( xmm6 );
3498
3499 for( ; remainder && k<kend; ++k ) {
3500 C(i ,j ) += A(i ,k) * B(k,j );
3501 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3502 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3503 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3504 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3505 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
3506 }
3507 }
3508 else
3509 {
3510 for( ; k<kend; ++k ) {
3511 C(i ,j ) += A(i ,k) * B(k,j );
3512 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3513 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3514 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3515 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3516 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
3517 }
3518 }
3519 }
3520
3521 if( j < jend )
3522 {
3523 const size_t kbegin( ( IsUpper_v<MT4> )
3524 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3525 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3526 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
3527
3528 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3529 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3530
3531 size_t k( kbegin );
3532
3533 if( k < kpos )
3534 {
3535 SIMDType b1( B.load(k,j) );
3536 SIMDType xmm1( A.load(i ,k) * b1 );
3537 SIMDType xmm2( A.load(i+1UL,k) * b1 );
3538 SIMDType xmm3( A.load(i+2UL,k) * b1 );
3539
3540 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3541 b1 = B.load(k,j);
3542 xmm1 += A.load(i ,k) * b1;
3543 xmm2 += A.load(i+1UL,k) * b1;
3544 xmm3 += A.load(i+2UL,k) * b1;
3545 }
3546
3547 C(i ,j) += sum( xmm1 );
3548 C(i+1UL,j) += sum( xmm2 );
3549 C(i+2UL,j) += sum( xmm3 );
3550
3551 for( ; remainder && k<kend; ++k ) {
3552 C(i ,j) += A(i ,k) * B(k,j);
3553 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
3554 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
3555 }
3556 }
3557 else
3558 {
3559 for( ; k<kend; ++k ) {
3560 C(i ,j) += A(i ,k) * B(k,j);
3561 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
3562 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
3563 }
3564 }
3565 }
3566 }
3567
3568 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
3569 {
3570 const size_t jend( LOW ? i+2UL : N );
3571 size_t j( UPP ? i : 0UL );
3572
3573 for( ; (j+4UL) <= jend; j+=4UL )
3574 {
3575 const size_t kbegin( ( IsUpper_v<MT4> )
3576 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3577 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3578 const size_t kend( ( IsLower_v<MT4> )
3579 ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
3580 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
3581
3582 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3583 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3584
3585 size_t k( kbegin );
3586
3587 if( k < kpos )
3588 {
3589 SIMDType a1( A.load(i ,k) );
3590 SIMDType a2( A.load(i+1UL,k) );
3591 SIMDType b1( B.load(k,j ) );
3592 SIMDType b2( B.load(k,j+1UL) );
3593 SIMDType b3( B.load(k,j+2UL) );
3594 SIMDType b4( B.load(k,j+3UL) );
3595 SIMDType xmm1( a1 * b1 );
3596 SIMDType xmm2( a1 * b2 );
3597 SIMDType xmm3( a1 * b3 );
3598 SIMDType xmm4( a1 * b4 );
3599 SIMDType xmm5( a2 * b1 );
3600 SIMDType xmm6( a2 * b2 );
3601 SIMDType xmm7( a2 * b3 );
3602 SIMDType xmm8( a2 * b4 );
3603
3604 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3605 a1 = A.load(i ,k);
3606 a2 = A.load(i+1UL,k);
3607 b1 = B.load(k,j );
3608 b2 = B.load(k,j+1UL);
3609 b3 = B.load(k,j+2UL);
3610 b4 = B.load(k,j+3UL);
3611 xmm1 += a1 * b1;
3612 xmm2 += a1 * b2;
3613 xmm3 += a1 * b3;
3614 xmm4 += a1 * b4;
3615 xmm5 += a2 * b1;
3616 xmm6 += a2 * b2;
3617 xmm7 += a2 * b3;
3618 xmm8 += a2 * b4;
3619 }
3620
3621 C(i ,j ) += sum( xmm1 );
3622 C(i ,j+1UL) += sum( xmm2 );
3623 C(i ,j+2UL) += sum( xmm3 );
3624 C(i ,j+3UL) += sum( xmm4 );
3625 C(i+1UL,j ) += sum( xmm5 );
3626 C(i+1UL,j+1UL) += sum( xmm6 );
3627 C(i+1UL,j+2UL) += sum( xmm7 );
3628 C(i+1UL,j+3UL) += sum( xmm8 );
3629
3630 for( ; remainder && k<kend; ++k ) {
3631 C(i ,j ) += A(i ,k) * B(k,j );
3632 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3633 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
3634 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
3635 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3636 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3637 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
3638 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
3639 }
3640 }
3641 else
3642 {
3643 for( ; k<kend; ++k ) {
3644 C(i ,j ) += A(i ,k) * B(k,j );
3645 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3646 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
3647 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
3648 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3649 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3650 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
3651 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
3652 }
3653 }
3654 }
3655
3656 for( ; (j+2UL) <= jend; j+=2UL )
3657 {
3658 const size_t kbegin( ( IsUpper_v<MT4> )
3659 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3660 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3661 const size_t kend( ( IsLower_v<MT4> )
3662 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
3663 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3664
3665 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3666 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3667
3668 size_t k( kbegin );
3669
3670 if( k < kpos )
3671 {
3672 SIMDType a1( A.load(i ,k) );
3673 SIMDType a2( A.load(i+1UL,k) );
3674 SIMDType b1( B.load(k,j ) );
3675 SIMDType b2( B.load(k,j+1UL) );
3676 SIMDType xmm1( a1 * b1 );
3677 SIMDType xmm2( a1 * b2 );
3678 SIMDType xmm3( a2 * b1 );
3679 SIMDType xmm4( a2 * b2 );
3680
3681 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3682 a1 = A.load(i ,k);
3683 a2 = A.load(i+1UL,k);
3684 b1 = B.load(k,j );
3685 b2 = B.load(k,j+1UL);
3686 xmm1 += a1 * b1;
3687 xmm2 += a1 * b2;
3688 xmm3 += a2 * b1;
3689 xmm4 += a2 * b2;
3690 }
3691
3692 C(i ,j ) += sum( xmm1 );
3693 C(i ,j+1UL) += sum( xmm2 );
3694 C(i+1UL,j ) += sum( xmm3 );
3695 C(i+1UL,j+1UL) += sum( xmm4 );
3696
3697 for( ; remainder && k<kend; ++k ) {
3698 C(i ,j ) += A(i ,k) * B(k,j );
3699 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3700 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3701 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3702 }
3703 }
3704 else
3705 {
3706 for( ; k<kend; ++k ) {
3707 C(i ,j ) += A(i ,k) * B(k,j );
3708 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3709 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3710 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3711 }
3712 }
3713 }
3714
3715 if( j < jend )
3716 {
3717 const size_t kbegin( ( IsUpper_v<MT4> )
3718 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3719 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3720 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3721
3722 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3723 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3724
3725 size_t k( kbegin );
3726
3727 if( k < kpos )
3728 {
3729 SIMDType b1( B.load(k,j) );
3730 SIMDType xmm1( A.load(i ,k) * b1 );
3731 SIMDType xmm2( A.load(i+1UL,k) * b1 );
3732
3733 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3734 b1 = B.load(k,j);
3735 xmm1 += A.load(i ,k) * b1;
3736 xmm2 += A.load(i+1UL,k) * b1;
3737 }
3738
3739 C(i ,j) += sum( xmm1 );
3740 C(i+1UL,j) += sum( xmm2 );
3741
3742 for( ; remainder && k<kend; ++k ) {
3743 C(i ,j) += A(i ,k) * B(k,j);
3744 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
3745 }
3746 }
3747 else
3748 {
3749 for( ; k<kend; ++k ) {
3750 C(i ,j) += A(i ,k) * B(k,j);
3751 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
3752 }
3753 }
3754 }
3755 }
3756
3757 for( ; i<M; ++i )
3758 {
3759 const size_t jend( LOW ? i+1UL : N );
3760 size_t j( UPP ? i : 0UL );
3761
3762 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
3763 {
3764 const size_t kbegin( ( IsUpper_v<MT4> )
3765 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3766 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3767 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
3768
3769 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3770 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3771
3772 size_t k( kbegin );
3773
3774 if( k < kpos )
3775 {
3776 SIMDType a1( A.load(i,k) );
3777 SIMDType xmm1( a1 * B.load(k,j ) );
3778 SIMDType xmm2( a1 * B.load(k,j+1UL) );
3779 SIMDType xmm3( a1 * B.load(k,j+2UL) );
3780 SIMDType xmm4( a1 * B.load(k,j+3UL) );
3781
3782 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3783 a1 = A.load(i,k);
3784 xmm1 += a1 * B.load(k,j );
3785 xmm2 += a1 * B.load(k,j+1UL);
3786 xmm3 += a1 * B.load(k,j+2UL);
3787 xmm4 += a1 * B.load(k,j+3UL);
3788 }
3789
3790 C(i,j ) += sum( xmm1 );
3791 C(i,j+1UL) += sum( xmm2 );
3792 C(i,j+2UL) += sum( xmm3 );
3793 C(i,j+3UL) += sum( xmm4 );
3794
3795 for( ; remainder && k<kend; ++k ) {
3796 C(i,j ) += A(i,k) * B(k,j );
3797 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
3798 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
3799 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
3800 }
3801 }
3802 else
3803 {
3804 for( ; k<kend; ++k ) {
3805 C(i,j ) += A(i,k) * B(k,j );
3806 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
3807 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
3808 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
3809 }
3810 }
3811 }
3812
3813 for( ; (j+2UL) <= jend; j+=2UL )
3814 {
3815 const size_t kbegin( ( IsUpper_v<MT4> )
3816 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3817 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3818 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3819
3820 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3821 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3822
3823 size_t k( kbegin );
3824
3825 if( k < kpos )
3826 {
3827 SIMDType a1( A.load(i,k) );
3828 SIMDType xmm1( a1 * B.load(k,j ) );
3829 SIMDType xmm2( a1 * B.load(k,j+1UL) );
3830
3831 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3832 a1 = A.load(i,k);
3833 xmm1 += a1 * B.load(k,j );
3834 xmm2 += a1 * B.load(k,j+1UL);
3835 }
3836
3837 C(i,j ) += sum( xmm1 );
3838 C(i,j+1UL) += sum( xmm2 );
3839
3840 for( ; remainder && k<kend; ++k ) {
3841 C(i,j ) += A(i,k) * B(k,j );
3842 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
3843 }
3844 }
3845 else
3846 {
3847 for( ; k<kend; ++k ) {
3848 C(i,j ) += A(i,k) * B(k,j );
3849 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
3850 }
3851 }
3852 }
3853
3854 if( j < jend )
3855 {
3856 const size_t kbegin( ( IsUpper_v<MT4> )
3857 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3858 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3859
3860 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
3861 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
3862
3863 size_t k( kbegin );
3864
3865 if( k < kpos )
3866 {
3867 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
3868
3869 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3870 xmm1 += A.load(i,k) * B.load(k,j);
3871 }
3872
3873 C(i,j) += sum( xmm1 );
3874
3875 for( ; remainder && k<K; ++k ) {
3876 C(i,j) += A(i,k) * B(k,j);
3877 }
3878 }
3879 else
3880 {
3881 for( ; k<K; ++k ) {
3882 C(i,j) += A(i,k) * B(k,j);
3883 }
3884 }
3885 }
3886 }
3887 }
3889 //**********************************************************************************************
3890
3891 //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized default addition assignment kernel for small matrices and column-major targets.
//
// Accumulates the product of A and B into C: every visited element C(i,j) receives
// "+=" contributions of A(i,k) * B(k,j) over a range of k. Nothing is returned.
//
//   C  target matrix that is updated in place
//   A  left-hand side operand (M rows, K columns)
//   B  right-hand side operand (K rows, N columns)
//
// The overload participates only if MT3 is a column-major matrix and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true (see the trailing return type).
//
// C is processed in register tiles of decreasing size: 4x2 / 4x1 and 3x3 / 3x2 / 3x1
// (only when neither LOW nor UPP is set), then 2x2 / 2x1, and finally a single leftover row
// handled as 1x2 / 1x1. LOW and UPP are flags defined outside this function; here they limit
// the visited column range of a row block (LOW: columns end at the block's last row,
// UPP: columns start at i).
//
// NOTE(review): prevMultiple(), sum(), A.load() and B.load() are defined elsewhere; the comments
// below assume prevMultiple() rounds down to a multiple of SIMDSIZE, load() reads SIMDSIZE
// consecutive elements along k, and sum() reduces a SIMD value to a scalar -- confirm.
3906 template< typename MT3 // Type of the left-hand side target matrix
3907 , typename MT4 // Type of the left-hand side matrix operand
3908 , typename MT5 > // Type of the right-hand side matrix operand
3909 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3910 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3911 {
// A scalar tail loop is required unless both operands are padded.
3912 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3913
3914 const size_t M( A.rows() );
3915 const size_t N( B.columns() );
3916 const size_t K( A.columns() );
3917
3918 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3919
3920 size_t i( 0UL );
3921
// Blocks of four rows; only used when the result is neither LOW nor UPP.
3922 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
3923 {
3924 size_t j( 0UL );
3925
// 4x2 tiles.
3926 for( ; (j+2UL) <= N; j+=2UL )
3927 {
// The k range is narrowed according to the IsUpper_v / IsLower_v traits of the operands;
// kbegin is passed through prevMultiple( ..., SIMDSIZE ).
3928 const size_t kbegin( ( IsUpper_v<MT4> )
3929 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
3930 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
3931 const size_t kend( ( IsLower_v<MT4> )
3932 ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
3933 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3934
// kpos is the end of the SIMD loop; with 'remainder' set, [kpos,kend) is left to the scalar loop.
3935 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
3936 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
3937
3938 size_t k( kbegin );
3939
// SIMD path: the first step initializes the accumulators, the loop adds the remaining steps.
3940 if( k < kpos )
3941 {
3942 SIMDType a1( A.load(i ,k) );
3943 SIMDType a2( A.load(i+1UL,k) );
3944 SIMDType a3( A.load(i+2UL,k) );
3945 SIMDType a4( A.load(i+3UL,k) );
3946 SIMDType b1( B.load(k,j ) );
3947 SIMDType b2( B.load(k,j+1UL) );
3948 SIMDType xmm1( a1 * b1 );
3949 SIMDType xmm2( a1 * b2 );
3950 SIMDType xmm3( a2 * b1 );
3951 SIMDType xmm4( a2 * b2 );
3952 SIMDType xmm5( a3 * b1 );
3953 SIMDType xmm6( a3 * b2 );
3954 SIMDType xmm7( a4 * b1 );
3955 SIMDType xmm8( a4 * b2 );
3956
3957 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
3958 a1 = A.load(i ,k);
3959 a2 = A.load(i+1UL,k);
3960 a3 = A.load(i+2UL,k);
3961 a4 = A.load(i+3UL,k);
3962 b1 = B.load(k,j );
3963 b2 = B.load(k,j+1UL);
3964 xmm1 += a1 * b1;
3965 xmm2 += a1 * b2;
3966 xmm3 += a2 * b1;
3967 xmm4 += a2 * b2;
3968 xmm5 += a3 * b1;
3969 xmm6 += a3 * b2;
3970 xmm7 += a4 * b1;
3971 xmm8 += a4 * b2;
3972 }
3973
3974 C(i ,j ) += sum( xmm1 );
3975 C(i ,j+1UL) += sum( xmm2 );
3976 C(i+1UL,j ) += sum( xmm3 );
3977 C(i+1UL,j+1UL) += sum( xmm4 );
3978 C(i+2UL,j ) += sum( xmm5 );
3979 C(i+2UL,j+1UL) += sum( xmm6 );
3980 C(i+3UL,j ) += sum( xmm7 );
3981 C(i+3UL,j+1UL) += sum( xmm8 );
3982
// Scalar tail for the k values between kpos and kend.
3983 for( ; remainder && k<kend; ++k ) {
3984 C(i ,j ) += A(i ,k) * B(k,j );
3985 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3986 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3987 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3988 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3989 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
3990 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
3991 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
3992 }
3993 }
// No SIMD step fits (kbegin >= kpos): the whole k range is handled element by element.
3994 else
3995 {
3996 for( ; k<kend; ++k ) {
3997 C(i ,j ) += A(i ,k) * B(k,j );
3998 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3999 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4000 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4001 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4002 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
4003 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
4004 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
4005 }
4006 }
4007 }
4008
// Leftover single column of the four-row block (4x1 tile).
4009 if( j < N )
4010 {
4011 const size_t kbegin( ( IsUpper_v<MT4> )
4012 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
4013 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
4014 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
4015
4016 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
4017 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
4018
4019 size_t k( kbegin );
4020
4021 if( k < kpos )
4022 {
4023 SIMDType b1( B.load(k,j) );
4024 SIMDType xmm1( A.load(i ,k) * b1 );
4025 SIMDType xmm2( A.load(i+1UL,k) * b1 );
4026 SIMDType xmm3( A.load(i+2UL,k) * b1 );
4027 SIMDType xmm4( A.load(i+3UL,k) * b1 );
4028
4029 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
4030 b1 = B.load(k,j);
4031 xmm1 += A.load(i ,k) * b1;
4032 xmm2 += A.load(i+1UL,k) * b1;
4033 xmm3 += A.load(i+2UL,k) * b1;
4034 xmm4 += A.load(i+3UL,k) * b1;
4035 }
4036
4037 C(i ,j) += sum( xmm1 );
4038 C(i+1UL,j) += sum( xmm2 );
4039 C(i+2UL,j) += sum( xmm3 );
4040 C(i+3UL,j) += sum( xmm4 );
4041
4042 for( ; remainder && k<kend; ++k ) {
4043 C(i ,j) += A(i ,k) * B(k,j);
4044 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4045 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
4046 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
4047 }
4048 }
4049 else
4050 {
4051 for( ; k<kend; ++k ) {
4052 C(i ,j) += A(i ,k) * B(k,j);
4053 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4054 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
4055 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
4056 }
4057 }
4058 }
4059 }
4060
// Blocks of three rows for what the four-row loop left over; again only without LOW / UPP.
4061 for( ; !LOW && !UPP && (i+3UL) <= M; i+=3UL )
4062 {
4063 size_t j( 0UL );
4064
// 3x3 tiles.
4065 for( ; (j+3UL) <= N; j+=3UL )
4066 {
4067 const size_t kbegin( ( IsUpper_v<MT4> )
4068 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
4069 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
4070 const size_t kend( ( IsLower_v<MT4> )
4071 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
4072 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
4073
4074 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
4075 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
4076
4077 size_t k( kbegin );
4078
4079 if( k < kpos )
4080 {
4081 SIMDType a1( A.load(i ,k) );
4082 SIMDType a2( A.load(i+1UL,k) );
4083 SIMDType a3( A.load(i+2UL,k) );
4084 SIMDType b1( B.load(k,j ) );
4085 SIMDType b2( B.load(k,j+1UL) );
4086 SIMDType b3( B.load(k,j+2UL) );
4087 SIMDType xmm1( a1 * b1 );
4088 SIMDType xmm2( a1 * b2 );
4089 SIMDType xmm3( a1 * b3 );
4090 SIMDType xmm4( a2 * b1 );
4091 SIMDType xmm5( a2 * b2 );
4092 SIMDType xmm6( a2 * b3 );
4093 SIMDType xmm7( a3 * b1 );
4094 SIMDType xmm8( a3 * b2 );
4095 SIMDType xmm9( a3 * b3 );
4096
4097 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
4098 a1 = A.load(i ,k);
4099 a2 = A.load(i+1UL,k);
4100 a3 = A.load(i+2UL,k);
4101 b1 = B.load(k,j );
4102 b2 = B.load(k,j+1UL);
4103 b3 = B.load(k,j+2UL);
4104 xmm1 += a1 * b1;
4105 xmm2 += a1 * b2;
4106 xmm3 += a1 * b3;
4107 xmm4 += a2 * b1;
4108 xmm5 += a2 * b2;
4109 xmm6 += a2 * b3;
4110 xmm7 += a3 * b1;
4111 xmm8 += a3 * b2;
4112 xmm9 += a3 * b3;
4113 }
4114
4115 C(i ,j ) += sum( xmm1 );
4116 C(i ,j+1UL) += sum( xmm2 );
4117 C(i ,j+2UL) += sum( xmm3 );
4118 C(i+1UL,j ) += sum( xmm4 );
4119 C(i+1UL,j+1UL) += sum( xmm5 );
4120 C(i+1UL,j+2UL) += sum( xmm6 );
4121 C(i+2UL,j ) += sum( xmm7 );
4122 C(i+2UL,j+1UL) += sum( xmm8 );
4123 C(i+2UL,j+2UL) += sum( xmm9 );
4124
4125 for( ; remainder && k<kend; ++k ) {
4126 C(i ,j ) += A(i ,k) * B(k,j );
4127 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4128 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
4129 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4130 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4131 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
4132 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4133 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
4134 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
4135 }
4136 }
4137 else
4138 {
4139 for( ; k<kend; ++k ) {
4140 C(i ,j ) += A(i ,k) * B(k,j );
4141 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4142 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
4143 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4144 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4145 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
4146 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4147 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
4148 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
4149 }
4150 }
4151 }
4152
// 3x2 tiles for the columns the 3x3 loop did not cover.
4153 for( ; (j+2UL) <= N; j+=2UL )
4154 {
4155 const size_t kbegin( ( IsUpper_v<MT4> )
4156 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
4157 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
4158 const size_t kend( ( IsLower_v<MT4> )
4159 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
4160 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
4161
4162 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
4163 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
4164
4165 size_t k( kbegin );
4166
4167 if( k < kpos )
4168 {
4169 SIMDType a1( A.load(i ,k) );
4170 SIMDType a2( A.load(i+1UL,k) );
4171 SIMDType a3( A.load(i+2UL,k) );
4172 SIMDType b1( B.load(k,j ) );
4173 SIMDType b2( B.load(k,j+1UL) );
4174 SIMDType xmm1( a1 * b1 );
4175 SIMDType xmm2( a1 * b2 );
4176 SIMDType xmm3( a2 * b1 );
4177 SIMDType xmm4( a2 * b2 );
4178 SIMDType xmm5( a3 * b1 );
4179 SIMDType xmm6( a3 * b2 );
4180
4181 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
4182 a1 = A.load(i ,k);
4183 a2 = A.load(i+1UL,k);
4184 a3 = A.load(i+2UL,k);
4185 b1 = B.load(k,j );
4186 b2 = B.load(k,j+1UL);
4187 xmm1 += a1 * b1;
4188 xmm2 += a1 * b2;
4189 xmm3 += a2 * b1;
4190 xmm4 += a2 * b2;
4191 xmm5 += a3 * b1;
4192 xmm6 += a3 * b2;
4193 }
4194
4195 C(i ,j ) += sum( xmm1 );
4196 C(i ,j+1UL) += sum( xmm2 );
4197 C(i+1UL,j ) += sum( xmm3 );
4198 C(i+1UL,j+1UL) += sum( xmm4 );
4199 C(i+2UL,j ) += sum( xmm5 );
4200 C(i+2UL,j+1UL) += sum( xmm6 );
4201
4202 for( ; remainder && k<kend; ++k ) {
4203 C(i ,j ) += A(i ,k) * B(k,j );
4204 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4205 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4206 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4207 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4208 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
4209 }
4210 }
4211 else
4212 {
4213 for( ; k<kend; ++k ) {
4214 C(i ,j ) += A(i ,k) * B(k,j );
4215 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4216 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4217 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4218 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4219 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
4220 }
4221 }
4222 }
4223
// Leftover single column of the three-row block (3x1 tile).
4224 if( j < N )
4225 {
4226 const size_t kbegin( ( IsUpper_v<MT4> )
4227 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
4228 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
4229 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
4230
4231 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
4232 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
4233
4234 size_t k( kbegin );
4235
4236 if( k < kpos )
4237 {
4238 SIMDType b1( B.load(k,j) );
4239 SIMDType xmm1( A.load(i ,k) * b1 );
4240 SIMDType xmm2( A.load(i+1UL,k) * b1 );
4241 SIMDType xmm3( A.load(i+2UL,k) * b1 );
4242
4243 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
4244 b1 = B.load(k,j);
4245 xmm1 += A.load(i ,k) * b1;
4246 xmm2 += A.load(i+1UL,k) * b1;
4247 xmm3 += A.load(i+2UL,k) * b1;
4248 }
4249
4250 C(i ,j) += sum( xmm1 );
4251 C(i+1UL,j) += sum( xmm2 );
4252 C(i+2UL,j) += sum( xmm3 );
4253
4254 for( ; remainder && k<kend; ++k ) {
4255 C(i ,j) += A(i ,k) * B(k,j);
4256 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4257 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
4258 }
4259 }
4260 else
4261 {
4262 for( ; k<kend; ++k ) {
4263 C(i ,j) += A(i ,k) * B(k,j);
4264 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4265 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
4266 }
4267 }
4268 }
4269 }
4270
// Blocks of two rows. This loop is also used for LOW / UPP results: the column range is
// [ UPP ? i : 0, LOW ? i+2 : N ).
4271 for( ; (i+2UL) <= M; i+=2UL )
4272 {
4273 const size_t jend( LOW ? i+2UL : N );
4274 size_t j( UPP ? i : 0UL );
4275
// 2x2 tiles.
4276 for( ; (j+2UL) <= jend; j+=2UL )
4277 {
4278 const size_t kbegin( ( IsUpper_v<MT4> )
4279 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
4280 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
4281 const size_t kend( ( IsLower_v<MT4> )
4282 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
4283 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
4284
4285 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
4286 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
4287
4288 size_t k( kbegin );
4289
4290 if( k < kpos )
4291 {
4292 SIMDType a1( A.load(i ,k) );
4293 SIMDType a2( A.load(i+1UL,k) );
4294 SIMDType b1( B.load(k,j ) );
4295 SIMDType b2( B.load(k,j+1UL) );
4296 SIMDType xmm1( a1 * b1 );
4297 SIMDType xmm2( a1 * b2 );
4298 SIMDType xmm3( a2 * b1 );
4299 SIMDType xmm4( a2 * b2 );
4300
4301 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
4302 a1 = A.load(i ,k);
4303 a2 = A.load(i+1UL,k);
4304 b1 = B.load(k,j );
4305 b2 = B.load(k,j+1UL);
4306 xmm1 += a1 * b1;
4307 xmm2 += a1 * b2;
4308 xmm3 += a2 * b1;
4309 xmm4 += a2 * b2;
4310 }
4311
4312 C(i ,j ) += sum( xmm1 );
4313 C(i ,j+1UL) += sum( xmm2 );
4314 C(i+1UL,j ) += sum( xmm3 );
4315 C(i+1UL,j+1UL) += sum( xmm4 );
4316
4317 for( ; remainder && k<kend; ++k ) {
4318 C(i ,j ) += A(i ,k) * B(k,j );
4319 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4320 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4321 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4322 }
4323 }
4324 else
4325 {
4326 for( ; k<kend; ++k ) {
4327 C(i ,j ) += A(i ,k) * B(k,j );
4328 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4329 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4330 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4331 }
4332 }
4333 }
4334
// Leftover single column of the two-row block (2x1 tile).
4335 if( j < jend )
4336 {
4337 const size_t kbegin( ( IsUpper_v<MT4> )
4338 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
4339 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
4340 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
4341
4342 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
4343 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
4344
4345 size_t k( kbegin );
4346
4347 if( k < kpos )
4348 {
4349 SIMDType b1( B.load(k,j) );
4350 SIMDType xmm1( A.load(i ,k) * b1 );
4351 SIMDType xmm2( A.load(i+1UL,k) * b1 );
4352
4353 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
4354 b1 = B.load(k,j);
4355 xmm1 += A.load(i ,k) * b1;
4356 xmm2 += A.load(i+1UL,k) * b1;
4357 }
4358
4359 C(i ,j) += sum( xmm1 );
4360 C(i+1UL,j) += sum( xmm2 );
4361
4362 for( ; remainder && k<kend; ++k ) {
4363 C(i ,j) += A(i ,k) * B(k,j);
4364 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4365 }
4366 }
4367 else
4368 {
4369 for( ; k<kend; ++k ) {
4370 C(i ,j) += A(i ,k) * B(k,j);
4371 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4372 }
4373 }
4374 }
4375 }
4376
// At most one row is left after the two-row loop; its column range is [ UPP ? i : 0, LOW ? i+1 : N ).
4377 if( i < M )
4378 {
4379 const size_t jend( LOW ? i+1UL : N );
4380 size_t j( UPP ? i : 0UL );
4381
// 1x2 tiles.
4382 for( ; (j+2UL) <= jend; j+=2UL )
4383 {
4384 const size_t kbegin( ( IsUpper_v<MT4> )
4385 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
4386 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
4387 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
4388
4389 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
4390 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
4391
4392 size_t k( kbegin );
4393
4394 if( k < kpos )
4395 {
4396 SIMDType a1( A.load(i,k) );
4397 SIMDType xmm1( a1 * B.load(k,j ) );
4398 SIMDType xmm2( a1 * B.load(k,j+1UL) );
4399
4400 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
4401 a1 = A.load(i,k);
4402 xmm1 += a1 * B.load(k,j );
4403 xmm2 += a1 * B.load(k,j+1UL);
4404 }
4405
4406 C(i,j ) += sum( xmm1 );
4407 C(i,j+1UL) += sum( xmm2 );
4408
4409 for( ; remainder && k<kend; ++k ) {
4410 C(i,j ) += A(i,k) * B(k,j );
4411 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
4412 }
4413 }
4414 else
4415 {
4416 for( ; k<kend; ++k ) {
4417 C(i,j ) += A(i,k) * B(k,j );
4418 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
4419 }
4420 }
4421 }
4422
// Last single element (1x1 tile). Only kbegin is narrowed here; the k range always ends at K.
4423 if( j < jend )
4424 {
4425 const size_t kbegin( ( IsUpper_v<MT4> )
4426 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
4427 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
4428
4429 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
4430 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
4431
4432 size_t k( kbegin );
4433
4434 if( k < kpos )
4435 {
4436 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
4437
4438 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
4439 xmm1 += A.load(i,k) * B.load(k,j);
4440 }
4441
4442 C(i,j) += sum( xmm1 );
4443
4444 for( ; remainder && k<K; ++k ) {
4445 C(i,j) += A(i,k) * B(k,j);
4446 }
4447 }
4448 else
4449 {
4450 for( ; k<K; ++k ) {
4451 C(i,j) += A(i,k) * B(k,j);
4452 }
4453 }
4454 }
4455 }
4456 }
4458 //**********************************************************************************************
4459
4460 //**Default addition assignment to dense matrices (large matrices)******************************
// Non-vectorized fallback of the large-matrix addition assignment kernel.
//
//   C  target matrix that is updated in place
//   A  left-hand side multiplication operand
//   B  right-hand side multiplication operand
//
// This overload is selected through DisableIf_t, i.e. whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It performs no work of its own and
// forwards all three arguments unchanged to selectDefaultAddAssignKernel().
4474 template< typename MT3 // Type of the left-hand side target matrix
4475 , typename MT4 // Type of the left-hand side matrix operand
4476 , typename MT5 > // Type of the right-hand side matrix operand
4477 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4478 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4479 {
4480 selectDefaultAddAssignKernel( C, A, B );
4481 }
4483 //**********************************************************************************************
4484
4485 //**Vectorized default addition assignment to dense matrices (large matrices)*******************
4500 template< typename MT3 // Type of the left-hand side target matrix
4501 , typename MT4 // Type of the left-hand side matrix operand
4502 , typename MT5 > // Type of the right-hand side matrix operand
4503 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4504 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4505 {
4506 if( LOW )
4507 lmmm( C, A, B, ElementType(1), ElementType(1) );
4508 else if( UPP )
4509 ummm( C, A, B, ElementType(1), ElementType(1) );
4510 else
4511 mmm( C, A, B, ElementType(1), ElementType(1) );
4512 }
4514 //**********************************************************************************************
4515
4516 //**BLAS-based addition assignment to dense matrices (default)**********************************
// Fallback of the BLAS-based addition assignment kernel.
//
//   C  target matrix that is updated in place
//   A  left-hand side multiplication operand
//   B  right-hand side multiplication operand
//
// This overload is selected through DisableIf_t, i.e. whenever UseBlasKernel_v<MT3,MT4,MT5>
// is false. It forwards all three arguments unchanged to selectLargeAddAssignKernel().
4530 template< typename MT3 // Type of the left-hand side target matrix
4531 , typename MT4 // Type of the left-hand side matrix operand
4532 , typename MT5 > // Type of the right-hand side matrix operand
4533 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4534 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4535 {
4536 selectLargeAddAssignKernel( C, A, B );
4537 }
4539 //**********************************************************************************************
4540
4541 //**BLAS-based addition assignment to dense matrices********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition assignment kernel; compiled only if both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set.
//
//   C  target matrix that is updated in place
//   A  left-hand side multiplication operand
//   B  right-hand side multiplication operand
//
// Enabled through EnableIf_t when UseBlasKernel_v<MT3,MT4,MT5> is true.
//  - Neither operand triangular: a single gemm() call with both scalars set to ET(1).
//  - A triangular (checked first): a temporary is built from B, passed to trmm() with A on the
//    left, and the temporary is then added to C.
//  - Otherwise (B triangular): a temporary is built from A, passed to trmm() with B on the right,
//    and the temporary is then added to C.
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5 >  // Type of the right-hand side matrix operand
static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
{
   using ET = ElementType_t<MT3>;

   if( !IsTriangular_v<MT4> && !IsTriangular_v<MT5> ) {
      gemm( C, A, B, ET(1), ET(1) );
   }
   else if( IsTriangular_v<MT4> ) {
      // trmm() works on the temporary, so C is only touched by the final addAssign().
      ResultType_t<MT3> product( serial( B ) );
      trmm( product, A, CblasLeft, ( IsLower_v<MT4> ? CblasLower : CblasUpper ), ET(1) );
      addAssign( C, product );
   }
   else {
      ResultType_t<MT3> product( serial( A ) );
      trmm( product, B, CblasRight, ( IsLower_v<MT5> ? CblasLower : CblasUpper ), ET(1) );
      addAssign( C, product );
   }
}
#endif
4580 //**********************************************************************************************
4581
4582 //**Addition assignment to sparse matrices******************************************************
4583 // No special implementation for the addition assignment to sparse matrices.
4584 //**********************************************************************************************
4585
4586 //**Subtraction assignment to dense matrices****************************************************
4599 template< typename MT // Type of the target dense matrix
4600 , bool SO > // Storage order of the target dense matrix
4601 friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4602 {
4604
4605 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
4606 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
4607
4608 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4609 return;
4610 }
4611
4612 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
4613 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
4614
4615 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4616 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4617 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4618 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4619 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
4620 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
4621
4622 DMatTDMatMultExpr::selectSubAssignKernel( *lhs, A, B );
4623 }
4625 //**********************************************************************************************
4626
4627 //**Subtraction assignment to dense matrices (kernel selection)*********************************
4638 template< typename MT3 // Type of the left-hand side target matrix
4639 , typename MT4 // Type of the left-hand side matrix operand
4640 , typename MT5 > // Type of the right-hand side matrix operand
4641 static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4642 {
4643 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
4644 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4645 selectSmallSubAssignKernel( C, A, B );
4646 else
4647 selectBlasSubAssignKernel( C, A, B );
4648 }
4650 //**********************************************************************************************
4651
4652 //**Default subtraction assignment to row-major dense matrices (general/general)****************
4666 template< typename MT3 // Type of the left-hand side target matrix
4667 , typename MT4 // Type of the left-hand side matrix operand
4668 , typename MT5 > // Type of the right-hand side matrix operand
4669 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4670 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4671 {
4672 const size_t M( A.rows() );
4673 const size_t N( B.columns() );
4674 const size_t K( A.columns() );
4675
4676 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4677
4678 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
4679 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
4680 :( 0UL ) );
4681 const size_t iend( ( IsStrictlyUpper_v<MT4> )
4682 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
4683 :( M ) );
4684 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4685
4686 for( size_t i=ibegin; i<iend; ++i )
4687 {
4688 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4689 ?( ( IsStrictlyUpper_v<MT4> )
4690 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
4691 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
4692 :( ( IsStrictlyUpper_v<MT5> )
4693 ?( UPP ? max( i, 1UL ) : 1UL )
4694 :( UPP ? i : 0UL ) ) );
4695 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
4696 ?( ( IsStrictlyLower_v<MT4> )
4697 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
4698 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
4699 :( ( IsStrictlyLower_v<MT5> )
4700 ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
4701 :( LOW ? i+1UL : N ) ) );
4702
4703 if( ( LOW || UPP ) && ( jbegin > jend ) ) continue;
4704 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4705
4706 for( size_t j=jbegin; j<jend; ++j )
4707 {
4708 const size_t kbegin( ( IsUpper_v<MT4> )
4709 ?( ( IsLower_v<MT5> )
4710 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4711 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4712 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4713 :( ( IsLower_v<MT5> )
4714 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4715 :( 0UL ) ) );
4716 const size_t kend( ( IsLower_v<MT4> )
4717 ?( ( IsUpper_v<MT5> )
4718 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4719 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4720 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4721 :( ( IsUpper_v<MT5> )
4722 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4723 :( K ) ) );
4724 BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
4725
4726 const size_t knum( kend - kbegin );
4727 const size_t kpos( kbegin + prevMultiple( knum, 2UL ) );
4728 BLAZE_INTERNAL_ASSERT( kpos <= kbegin+knum, "Invalid end calculation" );
4729
4730 for( size_t k=kbegin; k<kpos; k+=2UL ) {
4731 C(i,j) -= A(i,k ) * B(k ,j);
4732 C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
4733 }
4734 if( kpos < kend ) {
4735 C(i,j) -= A(i,kpos) * B(kpos,j);
4736 }
4737 }
4738 }
4739 }
4741 //**********************************************************************************************
4742
4743 //**Default subtraction assignment to column-major dense matrices (general/general)*************
4757 template< typename MT3 // Type of the left-hand side target matrix
4758 , typename MT4 // Type of the left-hand side matrix operand
4759 , typename MT5 > // Type of the right-hand side matrix operand
4760 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4761 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4762 {
4763 const size_t M( A.rows() );
4764 const size_t N( B.columns() );
4765 const size_t K( A.columns() );
4766
4767 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4768
4769 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
4770 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
4771 :( 0UL ) );
4772 const size_t jend( ( IsStrictlyLower_v<MT5> )
4773 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
4774 :( N ) );
4775 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4776
4777 for( size_t j=jbegin; j<jend; ++j )
4778 {
4779 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
4780 ?( ( IsStrictlyLower_v<MT4> )
4781 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
4782 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4783 :( ( IsStrictlyLower_v<MT4> )
4784 ?( LOW ? max( j, 1UL ) : 1UL )
4785 :( LOW ? j : 0UL ) ) );
4786 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4787 ?( ( IsStrictlyUpper_v<MT4> )
4788 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
4789 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
4790 :( ( IsStrictlyUpper_v<MT4> )
4791 ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
4792 :( UPP ? j+1UL : M ) ) );
4793
4794 if( ( LOW || UPP ) && ( ibegin > iend ) ) continue;
4795 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4796
4797 for( size_t i=ibegin; i<iend; ++i )
4798 {
4799 const size_t kbegin( ( IsUpper_v<MT4> )
4800 ?( ( IsLower_v<MT5> )
4801 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4802 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4803 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4804 :( ( IsLower_v<MT5> )
4805 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4806 :( 0UL ) ) );
4807 const size_t kend( ( IsLower_v<MT4> )
4808 ?( ( IsUpper_v<MT5> )
4809 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4810 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4811 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4812 :( ( IsUpper_v<MT5> )
4813 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4814 :( K ) ) );
4815 BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
4816
4817 const size_t knum( kend - kbegin );
4818 const size_t kpos( kbegin + prevMultiple( knum, 2UL ) );
4819 BLAZE_INTERNAL_ASSERT( kpos <= kbegin+knum, "Invalid end calculation" );
4820
4821 for( size_t k=kbegin; k<kpos; k+=2UL ) {
4822 C(i,j) -= A(i,k ) * B(k ,j);
4823 C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
4824 }
4825 if( kpos < kend ) {
4826 C(i,j) -= A(i,kpos) * B(kpos,j);
4827 }
4828 }
4829 }
4830 }
4832 //**********************************************************************************************
4833
4834 //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
4848 template< typename MT3 // Type of the left-hand side target matrix
4849 , typename MT4 // Type of the left-hand side matrix operand
4850 , typename MT5 > // Type of the right-hand side matrix operand
4851 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4852 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4853 {
4854 const size_t M( A.rows() );
4855 const size_t N( B.columns() );
4856
4857 for( size_t i=0UL; i<M; ++i )
4858 {
4859 const size_t jbegin( ( IsUpper_v<MT4> )
4860 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4861 :( 0UL ) );
4862 const size_t jend( ( IsLower_v<MT4> )
4863 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
4864 :( N ) );
4865 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4866
4867 const size_t jnum( jend - jbegin );
4868 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
4869 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
4870
4871 for( size_t j=jbegin; j<jpos; j+=2UL ) {
4872 C(i,j ) -= A(i,j ) * B(j ,j );
4873 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
4874 }
4875 if( jpos < jend ) {
4876 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
4877 }
4878 }
4879 }
4881 //**********************************************************************************************
4882
4883 //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
4897 template< typename MT3 // Type of the left-hand side target matrix
4898 , typename MT4 // Type of the left-hand side matrix operand
4899 , typename MT5 > // Type of the right-hand side matrix operand
4900 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4901 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4902 {
4903 constexpr size_t block( BLOCK_SIZE );
4904
4905 const size_t M( A.rows() );
4906 const size_t N( B.columns() );
4907
4908 for( size_t jj=0UL; jj<N; jj+=block ) {
4909 const size_t jend( min( N, jj+block ) );
4910 for( size_t ii=0UL; ii<M; ii+=block ) {
4911 const size_t iend( min( M, ii+block ) );
4912 for( size_t j=jj; j<jend; ++j )
4913 {
4914 const size_t ibegin( ( IsLower_v<MT4> )
4915 ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
4916 :( ii ) );
4917 const size_t ipos( ( IsUpper_v<MT4> )
4918 ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
4919 :( iend ) );
4920
4921 for( size_t i=ibegin; i<ipos; ++i ) {
4922 C(i,j) -= A(i,j) * B(j,j);
4923 }
4924 }
4925 }
4926 }
4927 }
4929 //**********************************************************************************************
4930
4931 //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
4945 template< typename MT3 // Type of the left-hand side target matrix
4946 , typename MT4 // Type of the left-hand side matrix operand
4947 , typename MT5 > // Type of the right-hand side matrix operand
4948 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4949 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4950 {
4951 constexpr size_t block( BLOCK_SIZE );
4952
4953 const size_t M( A.rows() );
4954 const size_t N( B.columns() );
4955
4956 for( size_t ii=0UL; ii<M; ii+=block ) {
4957 const size_t iend( min( M, ii+block ) );
4958 for( size_t jj=0UL; jj<N; jj+=block ) {
4959 const size_t jend( min( N, jj+block ) );
4960 for( size_t i=ii; i<iend; ++i )
4961 {
4962 const size_t jbegin( ( IsUpper_v<MT5> )
4963 ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
4964 :( jj ) );
4965 const size_t jpos( ( IsLower_v<MT5> )
4966 ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
4967 :( jend ) );
4968
4969 for( size_t j=jbegin; j<jpos; ++j ) {
4970 C(i,j) -= A(i,i) * B(i,j);
4971 }
4972 }
4973 }
4974 }
4975 }
4977 //**********************************************************************************************
4978
4979 //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
4993 template< typename MT3 // Type of the left-hand side target matrix
4994 , typename MT4 // Type of the left-hand side matrix operand
4995 , typename MT5 > // Type of the right-hand side matrix operand
4996 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4997 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4998 {
4999 const size_t M( A.rows() );
5000 const size_t N( B.columns() );
5001
5002 for( size_t j=0UL; j<N; ++j )
5003 {
5004 const size_t ibegin( ( IsLower_v<MT5> )
5005 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5006 :( 0UL ) );
5007 const size_t iend( ( IsUpper_v<MT5> )
5008 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5009 :( M ) );
5010 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5011
5012 const size_t inum( iend - ibegin );
5013 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
5014 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
5015
5016 for( size_t i=ibegin; i<ipos; i+=2UL ) {
5017 C(i ,j) -= A(i ,i ) * B(i ,j);
5018 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
5019 }
5020 if( ipos < iend ) {
5021 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
5022 }
5023 }
5024 }
5026 //**********************************************************************************************
5027
5028 //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
5042 template< typename MT3 // Type of the left-hand side target matrix
5043 , typename MT4 // Type of the left-hand side matrix operand
5044 , typename MT5 > // Type of the right-hand side matrix operand
5045 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5046 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5047 {
5048 for( size_t i=0UL; i<A.rows(); ++i ) {
5049 C(i,i) -= A(i,i) * B(i,i);
5050 }
5051 }
5053 //**********************************************************************************************
5054
5055 //**Default subtraction assignment to dense matrices (small matrices)***************************
5069 template< typename MT3 // Type of the left-hand side target matrix
5070 , typename MT4 // Type of the left-hand side matrix operand
5071 , typename MT5 > // Type of the right-hand side matrix operand
5072 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5073 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5074 {
5075 selectDefaultSubAssignKernel( C, A, B );
5076 }
5078 //**********************************************************************************************
5079
5080 //**Default subtraction assignment to row-major dense matrices (small matrices)*****************
// Vectorized subtraction assignment C -= A*B for small matrices and a row-major target.
//
// The target is processed in register tiles: first three rows of C at a time (3x3, 3x2 and
// 3x1 tiles), then two rows at a time (2x4, 2x2 and 2x1 tiles), and finally single rows
// (1x4, 1x2 and 1x1 tiles). For each tile the inner products are accumulated in SIMD
// registers over the inner index k, horizontally reduced via sum() and subtracted from C.
// Every tile follows the same scheme:
//   - kbegin/kend restrict the inner index range based on IsUpper_v/IsLower_v of the
//     operand types; kbegin is rounded down to a multiple of SIMDSIZE via prevMultiple().
//   - kpos is the end of the vectorized part of the range: kend itself if both operands
//     are padded, otherwise kend rounded down to a multiple of SIMDSIZE.
//   - If there is a vectorized part (k < kpos), it is processed with A.load()/B.load() and
//     the rest of the range [kpos,kend) is handled by a scalar loop (only if 'remainder').
//   - Otherwise the complete range [kbegin,kend) is handled by a scalar loop.
//
// NOTE(review): LOW and UPP are defined outside this excerpt; here they only restrict the
// column range of each row block (LOW: columns end after the row block, UPP: columns start
// at the first row of the block).
5095 template< typename MT3 // Type of the left-hand side target matrix
5096 , typename MT4 // Type of the left-hand side matrix operand
5097 , typename MT5 > // Type of the right-hand side matrix operand
5098 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5099 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5100 {
// A scalar remainder loop is only required if at least one operand is not padded
5101 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5102
5103 const size_t M( A.rows() );
5104 const size_t N( B.columns() );
5105 const size_t K( A.columns() );
5106
5107 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5108
5109 size_t i( 0UL );
5110
// Blocks of three rows (skipped entirely if both LOW and UPP are set)
5111 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
5112 {
5113 const size_t jend( LOW ? i+3UL : N );
5114 size_t j( UPP ? i : 0UL );
5115
// 3x3 tiles
5116 for( ; (j+3UL) <= jend; j+=3UL )
5117 {
5118 const size_t kbegin( ( IsUpper_v<MT4> )
5119 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5120 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5121 const size_t kend( ( IsLower_v<MT4> )
5122 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
5123 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
5124
5125 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5126 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5127
5128 size_t k( kbegin );
5129
5130 if( k < kpos )
5131 {
// The accumulators are initialized with the products of the first SIMD step
5132 SIMDType a1( A.load(i ,k) );
5133 SIMDType a2( A.load(i+1UL,k) );
5134 SIMDType a3( A.load(i+2UL,k) );
5135 SIMDType b1( B.load(k,j ) );
5136 SIMDType b2( B.load(k,j+1UL) );
5137 SIMDType b3( B.load(k,j+2UL) );
5138 SIMDType xmm1( a1 * b1 );
5139 SIMDType xmm2( a1 * b2 );
5140 SIMDType xmm3( a1 * b3 );
5141 SIMDType xmm4( a2 * b1 );
5142 SIMDType xmm5( a2 * b2 );
5143 SIMDType xmm6( a2 * b3 );
5144 SIMDType xmm7( a3 * b1 );
5145 SIMDType xmm8( a3 * b2 );
5146 SIMDType xmm9( a3 * b3 );
5147
5148 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5149 a1 = A.load(i ,k);
5150 a2 = A.load(i+1UL,k);
5151 a3 = A.load(i+2UL,k);
5152 b1 = B.load(k,j );
5153 b2 = B.load(k,j+1UL);
5154 b3 = B.load(k,j+2UL);
5155 xmm1 += a1 * b1;
5156 xmm2 += a1 * b2;
5157 xmm3 += a1 * b3;
5158 xmm4 += a2 * b1;
5159 xmm5 += a2 * b2;
5160 xmm6 += a2 * b3;
5161 xmm7 += a3 * b1;
5162 xmm8 += a3 * b2;
5163 xmm9 += a3 * b3;
5164 }
5165
// Horizontal reduction of the accumulators into the target
5166 C(i ,j ) -= sum( xmm1 );
5167 C(i ,j+1UL) -= sum( xmm2 );
5168 C(i ,j+2UL) -= sum( xmm3 );
5169 C(i+1UL,j ) -= sum( xmm4 );
5170 C(i+1UL,j+1UL) -= sum( xmm5 );
5171 C(i+1UL,j+2UL) -= sum( xmm6 );
5172 C(i+2UL,j ) -= sum( xmm7 );
5173 C(i+2UL,j+1UL) -= sum( xmm8 );
5174 C(i+2UL,j+2UL) -= sum( xmm9 );
5175
// Scalar handling of the remaining inner indices [kpos,kend)
5176 for( ; remainder && k<kend; ++k ) {
5177 C(i ,j ) -= A(i ,k) * B(k,j );
5178 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5179 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5180 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5181 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5182 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5183 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5184 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5185 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL);
5186 }
5187 }
5188 else
5189 {
// No vectorized part: the complete inner range is processed element-wise
5190 for( ; k<kend; ++k ) {
5191 C(i ,j ) -= A(i ,k) * B(k,j );
5192 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5193 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5194 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5195 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5196 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5197 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5198 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5199 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL);
5200 }
5201 }
5202 }
5203
// 3x2 tiles
5204 for( ; (j+2UL) <= jend; j+=2UL )
5205 {
5206 const size_t kbegin( ( IsUpper_v<MT4> )
5207 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5208 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5209 const size_t kend( ( IsLower_v<MT4> )
5210 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
5211 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5212
5213 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5214 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5215
5216 size_t k( kbegin );
5217
5218 if( k < kpos )
5219 {
5220 SIMDType a1( A.load(i ,k) );
5221 SIMDType a2( A.load(i+1UL,k) );
5222 SIMDType a3( A.load(i+2UL,k) );
5223 SIMDType b1( B.load(k,j ) );
5224 SIMDType b2( B.load(k,j+1UL) );
5225 SIMDType xmm1( a1 * b1 );
5226 SIMDType xmm2( a1 * b2 );
5227 SIMDType xmm3( a2 * b1 );
5228 SIMDType xmm4( a2 * b2 );
5229 SIMDType xmm5( a3 * b1 );
5230 SIMDType xmm6( a3 * b2 );
5231
5232 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5233 a1 = A.load(i ,k);
5234 a2 = A.load(i+1UL,k);
5235 a3 = A.load(i+2UL,k);
5236 b1 = B.load(k,j );
5237 b2 = B.load(k,j+1UL);
5238 xmm1 += a1 * b1;
5239 xmm2 += a1 * b2;
5240 xmm3 += a2 * b1;
5241 xmm4 += a2 * b2;
5242 xmm5 += a3 * b1;
5243 xmm6 += a3 * b2;
5244 }
5245
5246 C(i ,j ) -= sum( xmm1 );
5247 C(i ,j+1UL) -= sum( xmm2 );
5248 C(i+1UL,j ) -= sum( xmm3 );
5249 C(i+1UL,j+1UL) -= sum( xmm4 );
5250 C(i+2UL,j ) -= sum( xmm5 );
5251 C(i+2UL,j+1UL) -= sum( xmm6 );
5252
5253 for( ; remainder && k<kend; ++k ) {
5254 C(i ,j ) -= A(i ,k) * B(k,j );
5255 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5256 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5257 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5258 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5259 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5260 }
5261 }
5262 else
5263 {
5264 for( ; k<kend; ++k ) {
5265 C(i ,j ) -= A(i ,k) * B(k,j );
5266 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5267 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5268 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5269 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5270 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5271 }
5272 }
5273 }
5274
// 3x1 tile for a single remaining column; here kend does not depend on MT5
5275 if( j < jend )
5276 {
5277 const size_t kbegin( ( IsUpper_v<MT4> )
5278 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5279 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5280 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
5281
5282 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5283 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5284
5285 size_t k( kbegin );
5286
5287 if( k < kpos )
5288 {
5289 SIMDType b1( B.load(k,j) );
5290 SIMDType xmm1( A.load(i ,k) * b1 );
5291 SIMDType xmm2( A.load(i+1UL,k) * b1 );
5292 SIMDType xmm3( A.load(i+2UL,k) * b1 );
5293
5294 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5295 b1 = B.load(k,j);
5296 xmm1 += A.load(i ,k) * b1;
5297 xmm2 += A.load(i+1UL,k) * b1;
5298 xmm3 += A.load(i+2UL,k) * b1;
5299 }
5300
5301 C(i ,j) -= sum( xmm1 );
5302 C(i+1UL,j) -= sum( xmm2 );
5303 C(i+2UL,j) -= sum( xmm3 );
5304
5305 for( ; remainder && k<kend; ++k ) {
5306 C(i ,j) -= A(i ,k) * B(k,j);
5307 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5308 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
5309 }
5310 }
5311 else
5312 {
5313 for( ; k<kend; ++k ) {
5314 C(i ,j) -= A(i ,k) * B(k,j);
5315 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5316 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
5317 }
5318 }
5319 }
5320 }
5321
// Blocks of two rows for the rows left over by the three-row loop
// (skipped entirely if both LOW and UPP are set)
5322 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
5323 {
5324 const size_t jend( LOW ? i+2UL : N );
5325 size_t j( UPP ? i : 0UL );
5326
// 2x4 tiles
5327 for( ; (j+4UL) <= jend; j+=4UL )
5328 {
5329 const size_t kbegin( ( IsUpper_v<MT4> )
5330 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5331 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5332 const size_t kend( ( IsLower_v<MT4> )
5333 ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
5334 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
5335
5336 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5337 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5338
5339 size_t k( kbegin );
5340
5341 if( k < kpos )
5342 {
5343 SIMDType a1( A.load(i ,k) );
5344 SIMDType a2( A.load(i+1UL,k) );
5345 SIMDType b1( B.load(k,j ) );
5346 SIMDType b2( B.load(k,j+1UL) );
5347 SIMDType b3( B.load(k,j+2UL) );
5348 SIMDType b4( B.load(k,j+3UL) );
5349 SIMDType xmm1( a1 * b1 );
5350 SIMDType xmm2( a1 * b2 );
5351 SIMDType xmm3( a1 * b3 );
5352 SIMDType xmm4( a1 * b4 );
5353 SIMDType xmm5( a2 * b1 );
5354 SIMDType xmm6( a2 * b2 );
5355 SIMDType xmm7( a2 * b3 );
5356 SIMDType xmm8( a2 * b4 );
5357
5358 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5359 a1 = A.load(i ,k);
5360 a2 = A.load(i+1UL,k);
5361 b1 = B.load(k,j );
5362 b2 = B.load(k,j+1UL);
5363 b3 = B.load(k,j+2UL);
5364 b4 = B.load(k,j+3UL);
5365 xmm1 += a1 * b1;
5366 xmm2 += a1 * b2;
5367 xmm3 += a1 * b3;
5368 xmm4 += a1 * b4;
5369 xmm5 += a2 * b1;
5370 xmm6 += a2 * b2;
5371 xmm7 += a2 * b3;
5372 xmm8 += a2 * b4;
5373 }
5374
5375 C(i ,j ) -= sum( xmm1 );
5376 C(i ,j+1UL) -= sum( xmm2 );
5377 C(i ,j+2UL) -= sum( xmm3 );
5378 C(i ,j+3UL) -= sum( xmm4 );
5379 C(i+1UL,j ) -= sum( xmm5 );
5380 C(i+1UL,j+1UL) -= sum( xmm6 );
5381 C(i+1UL,j+2UL) -= sum( xmm7 );
5382 C(i+1UL,j+3UL) -= sum( xmm8 );
5383
5384 for( ; remainder && k<kend; ++k ) {
5385 C(i ,j ) -= A(i ,k) * B(k,j );
5386 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5387 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5388 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
5389 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5390 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5391 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5392 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
5393 }
5394 }
5395 else
5396 {
5397 for( ; k<kend; ++k ) {
5398 C(i ,j ) -= A(i ,k) * B(k,j );
5399 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5400 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5401 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
5402 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5403 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5404 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5405 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
5406 }
5407 }
5408 }
5409
// 2x2 tiles
5410 for( ; (j+2UL) <= jend; j+=2UL )
5411 {
5412 const size_t kbegin( ( IsUpper_v<MT4> )
5413 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5414 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5415 const size_t kend( ( IsLower_v<MT4> )
5416 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
5417 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5418
5419 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5420 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5421
5422 size_t k( kbegin );
5423
5424 if( k < kpos )
5425 {
5426 SIMDType a1( A.load(i ,k) );
5427 SIMDType a2( A.load(i+1UL,k) );
5428 SIMDType b1( B.load(k,j ) );
5429 SIMDType b2( B.load(k,j+1UL) );
5430 SIMDType xmm1( a1 * b1 );
5431 SIMDType xmm2( a1 * b2 );
5432 SIMDType xmm3( a2 * b1 );
5433 SIMDType xmm4( a2 * b2 );
5434
5435 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5436 a1 = A.load(i ,k);
5437 a2 = A.load(i+1UL,k);
5438 b1 = B.load(k,j );
5439 b2 = B.load(k,j+1UL);
5440 xmm1 += a1 * b1;
5441 xmm2 += a1 * b2;
5442 xmm3 += a2 * b1;
5443 xmm4 += a2 * b2;
5444 }
5445
5446 C(i ,j ) -= sum( xmm1 );
5447 C(i ,j+1UL) -= sum( xmm2 );
5448 C(i+1UL,j ) -= sum( xmm3 );
5449 C(i+1UL,j+1UL) -= sum( xmm4 );
5450
5451 for( ; remainder && k<kend; ++k ) {
5452 C(i ,j ) -= A(i ,k) * B(k,j );
5453 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5454 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5455 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5456 }
5457 }
5458 else
5459 {
5460 for( ; k<kend; ++k ) {
5461 C(i ,j ) -= A(i ,k) * B(k,j );
5462 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5463 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5464 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5465 }
5466 }
5467 }
5468
// 2x1 tile for a single remaining column; here kend does not depend on MT5
5469 if( j < jend )
5470 {
5471 const size_t kbegin( ( IsUpper_v<MT4> )
5472 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5473 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5474 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5475
5476 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5477 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5478
5479 size_t k( kbegin );
5480
5481 if( k < kpos )
5482 {
5483 SIMDType b1( B.load(k,j) );
5484 SIMDType xmm1( A.load(i ,k) * b1 );
5485 SIMDType xmm2( A.load(i+1UL,k) * b1 );
5486
5487 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5488 b1 = B.load(k,j);
5489 xmm1 += A.load(i ,k) * b1;
5490 xmm2 += A.load(i+1UL,k) * b1;
5491 }
5492
5493 C(i ,j) -= sum( xmm1 );
5494 C(i+1UL,j) -= sum( xmm2 );
5495
5496 for( ; remainder && k<kend; ++k ) {
5497 C(i ,j) -= A(i ,k) * B(k,j);
5498 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5499 }
5500 }
5501 else
5502 {
5503 for( ; k<kend; ++k ) {
5504 C(i ,j) -= A(i ,k) * B(k,j);
5505 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5506 }
5507 }
5508 }
5509 }
5510
// Remaining rows are processed one at a time. If both LOW and UPP are set, all rows end up
// here and the column range [i,i+1) covers only the diagonal element of each row.
5511 for( ; i<M; ++i )
5512 {
5513 const size_t jend( LOW ? i+1UL : N );
5514 size_t j( UPP ? i : 0UL );
5515
// 1x4 tiles; in this single-row loop kend depends on MT5 only
5516 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
5517 {
5518 const size_t kbegin( ( IsUpper_v<MT4> )
5519 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5520 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5521 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
5522
5523 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5524 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5525
5526 size_t k( kbegin );
5527
5528 if( k < kpos )
5529 {
5530 SIMDType a1( A.load(i,k) );
5531 SIMDType xmm1( a1 * B.load(k,j ) );
5532 SIMDType xmm2( a1 * B.load(k,j+1UL) );
5533 SIMDType xmm3( a1 * B.load(k,j+2UL) );
5534 SIMDType xmm4( a1 * B.load(k,j+3UL) );
5535
5536 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5537 a1 = A.load(i,k);
5538 xmm1 += a1 * B.load(k,j );
5539 xmm2 += a1 * B.load(k,j+1UL);
5540 xmm3 += a1 * B.load(k,j+2UL);
5541 xmm4 += a1 * B.load(k,j+3UL);
5542 }
5543
5544 C(i,j ) -= sum( xmm1 );
5545 C(i,j+1UL) -= sum( xmm2 );
5546 C(i,j+2UL) -= sum( xmm3 );
5547 C(i,j+3UL) -= sum( xmm4 );
5548
5549 for( ; remainder && k<kend; ++k ) {
5550 C(i,j ) -= A(i,k) * B(k,j );
5551 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
5552 C(i,j+2UL) -= A(i,k) * B(k,j+2UL);
5553 C(i,j+3UL) -= A(i,k) * B(k,j+3UL);
5554 }
5555 }
5556 else
5557 {
5558 for( ; k<kend; ++k ) {
5559 C(i,j ) -= A(i,k) * B(k,j );
5560 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
5561 C(i,j+2UL) -= A(i,k) * B(k,j+2UL);
5562 C(i,j+3UL) -= A(i,k) * B(k,j+3UL);
5563 }
5564 }
5565 }
5566
// 1x2 tiles
5567 for( ; (j+2UL) <= jend; j+=2UL )
5568 {
5569 const size_t kbegin( ( IsUpper_v<MT4> )
5570 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5571 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5572 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5573
5574 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5575 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5576
5577 size_t k( kbegin );
5578
5579 if( k < kpos )
5580 {
5581 SIMDType a1( A.load(i,k) );
5582 SIMDType xmm1( a1 * B.load(k,j ) );
5583 SIMDType xmm2( a1 * B.load(k,j+1UL) );
5584
5585 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5586 a1 = A.load(i,k);
5587 xmm1 += a1 * B.load(k,j );
5588 xmm2 += a1 * B.load(k,j+1UL);
5589 }
5590
5591 C(i,j ) -= sum( xmm1 );
5592 C(i,j+1UL) -= sum( xmm2 );
5593
5594 for( ; remainder && k<kend; ++k ) {
5595 C(i,j ) -= A(i,k) * B(k,j );
5596 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
5597 }
5598 }
5599 else
5600 {
5601 for( ; k<kend; ++k ) {
5602 C(i,j ) -= A(i,k) * B(k,j );
5603 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
5604 }
5605 }
5606 }
5607
// Single remaining element of the row; the inner range always extends to K here
5608 if( j < jend )
5609 {
5610 const size_t kbegin( ( IsUpper_v<MT4> )
5611 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5612 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5613
5614 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
5615 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
5616
5617 size_t k( kbegin );
5618
5619 if( k < kpos )
5620 {
5621 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
5622
5623 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5624 xmm1 += A.load(i,k) * B.load(k,j);
5625 }
5626
5627 C(i,j) -= sum( xmm1 );
5628
5629 for( ; remainder && k<K; ++k ) {
5630 C(i,j) -= A(i,k) * B(k,j);
5631 }
5632 }
5633 else
5634 {
5635 for( ; k<K; ++k ) {
5636 C(i,j) -= A(i,k) * B(k,j);
5637 }
5638 }
5639 }
5640 }
5641 }
5643 //**********************************************************************************************
5644
5645 //**Default subtraction assignment to column-major dense matrices (small matrices)**************
// Vectorized small-matrix kernel for the subtraction assignment C -= A * B. The overload is
// selected (via EnableIf_t) only if the target type MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds.
//
// \param C The target left-hand side dense matrix (updated in place).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Every element C(i,j) is reduced by the sum over k of A(i,k)*B(k,j). The k-range [kbegin,kend)
// is traversed in steps of SIMDSIZE up to kpos using A.load()/B.load(), the resulting partial
// sums are collapsed with sum(), and the remaining k values are handled by a scalar loop. The
// (i,j) index space is tiled as 4x2 (+4x1), then 3x3/3x2/3x1, then 2x2/2x1 and finally 1x2/1x1.
// The 4-row and 3-row tiles are skipped entirely if LOW or UPP is set.
5660 template< typename MT3 // Type of the left-hand side target matrix
5661 , typename MT4 // Type of the left-hand side matrix operand
5662 , typename MT5 > // Type of the right-hand side matrix operand
5663 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5664 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5665 {
     // A scalar tail loop is only needed if at least one of the two operands is not padded.
5666 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5667
5668 const size_t M( A.rows() );
5669 const size_t N( B.columns() );
5670 const size_t K( A.columns() );
5671
     // A result restricted to its lower or upper part is expected to be square.
5672 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5673
5674 size_t i( 0UL );
5675
     // Tiles of four rows (general, i.e. neither LOW nor UPP, results only).
5676 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
5677 {
5678 size_t j( 0UL );
5679
     // 4x2 tiles.
5680 for( ; (j+2UL) <= N; j+=2UL )
5681 {
     // Narrow the k-range by the compile-time structure of the operands: an upper A or a
     // lower B raises the start, a lower A or an upper B lowers the end. The start is passed
     // through prevMultiple() with SIMDSIZE (presumably rounding down to a multiple of
     // SIMDSIZE - TODO confirm).
5682 const size_t kbegin( ( IsUpper_v<MT4> )
5683 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5684 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5685 const size_t kend( ( IsLower_v<MT4> )
5686 ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
5687 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5688
     // End of the SIMD part; without a remainder the SIMD loop covers the range up to kend.
5689 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5690 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5691
5692 size_t k( kbegin );
5693
     // At least one SIMD step fits into the range: the accumulators are initialized from the
     // first step instead of being zero-initialized.
5694 if( k < kpos )
5695 {
5696 SIMDType a1( A.load(i ,k) );
5697 SIMDType a2( A.load(i+1UL,k) );
5698 SIMDType a3( A.load(i+2UL,k) );
5699 SIMDType a4( A.load(i+3UL,k) );
5700 SIMDType b1( B.load(k,j ) );
5701 SIMDType b2( B.load(k,j+1UL) );
5702 SIMDType xmm1( a1 * b1 );
5703 SIMDType xmm2( a1 * b2 );
5704 SIMDType xmm3( a2 * b1 );
5705 SIMDType xmm4( a2 * b2 );
5706 SIMDType xmm5( a3 * b1 );
5707 SIMDType xmm6( a3 * b2 );
5708 SIMDType xmm7( a4 * b1 );
5709 SIMDType xmm8( a4 * b2 );
5710
5711 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5712 a1 = A.load(i ,k);
5713 a2 = A.load(i+1UL,k);
5714 a3 = A.load(i+2UL,k);
5715 a4 = A.load(i+3UL,k);
5716 b1 = B.load(k,j );
5717 b2 = B.load(k,j+1UL);
5718 xmm1 += a1 * b1;
5719 xmm2 += a1 * b2;
5720 xmm3 += a2 * b1;
5721 xmm4 += a2 * b2;
5722 xmm5 += a3 * b1;
5723 xmm6 += a3 * b2;
5724 xmm7 += a4 * b1;
5725 xmm8 += a4 * b2;
5726 }
5727
     // Collapse each accumulator and subtract it from the matching target element.
5728 C(i ,j ) -= sum( xmm1 );
5729 C(i ,j+1UL) -= sum( xmm2 );
5730 C(i+1UL,j ) -= sum( xmm3 );
5731 C(i+1UL,j+1UL) -= sum( xmm4 );
5732 C(i+2UL,j ) -= sum( xmm5 );
5733 C(i+2UL,j+1UL) -= sum( xmm6 );
5734 C(i+3UL,j ) -= sum( xmm7 );
5735 C(i+3UL,j+1UL) -= sum( xmm8 );
5736
     // Scalar tail for the k values in [kpos,kend).
5737 for( ; remainder && k<kend; ++k ) {
5738 C(i ,j ) -= A(i ,k) * B(k,j );
5739 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5740 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5741 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5742 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5743 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5744 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
5745 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
5746 }
5747 }
     // No SIMD step fits between kbegin and kpos: process the whole range element by element.
5748 else
5749 {
5750 for( ; k<kend; ++k ) {
5751 C(i ,j ) -= A(i ,k) * B(k,j );
5752 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5753 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5754 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5755 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5756 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5757 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
5758 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
5759 }
5760 }
5761 }
5762
     // Remaining single column of the four-row tile (4x1); the end of the k-range is not
     // narrowed for an upper B here.
5763 if( j < N )
5764 {
5765 const size_t kbegin( ( IsUpper_v<MT4> )
5766 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5767 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5768 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
5769
5770 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5771 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5772
5773 size_t k( kbegin );
5774
5775 if( k < kpos )
5776 {
5777 SIMDType b1( B.load(k,j) );
5778 SIMDType xmm1( A.load(i ,k) * b1 );
5779 SIMDType xmm2( A.load(i+1UL,k) * b1 );
5780 SIMDType xmm3( A.load(i+2UL,k) * b1 );
5781 SIMDType xmm4( A.load(i+3UL,k) * b1 );
5782
5783 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5784 b1 = B.load(k,j);
5785 xmm1 += A.load(i ,k) * b1;
5786 xmm2 += A.load(i+1UL,k) * b1;
5787 xmm3 += A.load(i+2UL,k) * b1;
5788 xmm4 += A.load(i+3UL,k) * b1;
5789 }
5790
5791 C(i ,j) -= sum( xmm1 );
5792 C(i+1UL,j) -= sum( xmm2 );
5793 C(i+2UL,j) -= sum( xmm3 );
5794 C(i+3UL,j) -= sum( xmm4 );
5795
5796 for( ; remainder && k<kend; ++k ) {
5797 C(i ,j) -= A(i ,k) * B(k,j);
5798 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5799 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
5800 C(i+3UL,j) -= A(i+3UL,k) * B(k,j);
5801 }
5802 }
5803 else
5804 {
5805 for( ; k<kend; ++k ) {
5806 C(i ,j) -= A(i ,k) * B(k,j);
5807 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5808 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
5809 C(i+3UL,j) -= A(i+3UL,k) * B(k,j);
5810 }
5811 }
5812 }
5813 }
5814
     // Tiles of three rows (general results only), covering the rows left over by the
     // four-row tiles.
5815 for( ; !LOW && !UPP && (i+3UL) <= M; i+=3UL )
5816 {
5817 size_t j( 0UL );
5818
     // 3x3 tiles.
5819 for( ; (j+3UL) <= N; j+=3UL )
5820 {
5821 const size_t kbegin( ( IsUpper_v<MT4> )
5822 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5823 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5824 const size_t kend( ( IsLower_v<MT4> )
5825 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
5826 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
5827
5828 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5829 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5830
5831 size_t k( kbegin );
5832
5833 if( k < kpos )
5834 {
5835 SIMDType a1( A.load(i ,k) );
5836 SIMDType a2( A.load(i+1UL,k) );
5837 SIMDType a3( A.load(i+2UL,k) );
5838 SIMDType b1( B.load(k,j ) );
5839 SIMDType b2( B.load(k,j+1UL) );
5840 SIMDType b3( B.load(k,j+2UL) );
5841 SIMDType xmm1( a1 * b1 );
5842 SIMDType xmm2( a1 * b2 );
5843 SIMDType xmm3( a1 * b3 );
5844 SIMDType xmm4( a2 * b1 );
5845 SIMDType xmm5( a2 * b2 );
5846 SIMDType xmm6( a2 * b3 );
5847 SIMDType xmm7( a3 * b1 );
5848 SIMDType xmm8( a3 * b2 );
5849 SIMDType xmm9( a3 * b3 );
5850
5851 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5852 a1 = A.load(i ,k);
5853 a2 = A.load(i+1UL,k);
5854 a3 = A.load(i+2UL,k);
5855 b1 = B.load(k,j );
5856 b2 = B.load(k,j+1UL);
5857 b3 = B.load(k,j+2UL);
5858 xmm1 += a1 * b1;
5859 xmm2 += a1 * b2;
5860 xmm3 += a1 * b3;
5861 xmm4 += a2 * b1;
5862 xmm5 += a2 * b2;
5863 xmm6 += a2 * b3;
5864 xmm7 += a3 * b1;
5865 xmm8 += a3 * b2;
5866 xmm9 += a3 * b3;
5867 }
5868
5869 C(i ,j ) -= sum( xmm1 );
5870 C(i ,j+1UL) -= sum( xmm2 );
5871 C(i ,j+2UL) -= sum( xmm3 );
5872 C(i+1UL,j ) -= sum( xmm4 );
5873 C(i+1UL,j+1UL) -= sum( xmm5 );
5874 C(i+1UL,j+2UL) -= sum( xmm6 );
5875 C(i+2UL,j ) -= sum( xmm7 );
5876 C(i+2UL,j+1UL) -= sum( xmm8 );
5877 C(i+2UL,j+2UL) -= sum( xmm9 );
5878
5879 for( ; remainder && k<kend; ++k ) {
5880 C(i ,j ) -= A(i ,k) * B(k,j );
5881 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5882 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5883 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5884 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5885 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5886 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5887 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5888 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL);
5889 }
5890 }
5891 else
5892 {
5893 for( ; k<kend; ++k ) {
5894 C(i ,j ) -= A(i ,k) * B(k,j );
5895 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5896 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5897 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5898 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5899 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5900 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5901 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5902 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL);
5903 }
5904 }
5905 }
5906
     // 3x2 tile for the columns left over by the 3x3 tiles.
5907 for( ; (j+2UL) <= N; j+=2UL )
5908 {
5909 const size_t kbegin( ( IsUpper_v<MT4> )
5910 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5911 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5912 const size_t kend( ( IsLower_v<MT4> )
5913 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
5914 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5915
5916 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5917 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5918
5919 size_t k( kbegin );
5920
5921 if( k < kpos )
5922 {
5923 SIMDType a1( A.load(i ,k) );
5924 SIMDType a2( A.load(i+1UL,k) );
5925 SIMDType a3( A.load(i+2UL,k) );
5926 SIMDType b1( B.load(k,j ) );
5927 SIMDType b2( B.load(k,j+1UL) );
5928 SIMDType xmm1( a1 * b1 );
5929 SIMDType xmm2( a1 * b2 );
5930 SIMDType xmm3( a2 * b1 );
5931 SIMDType xmm4( a2 * b2 );
5932 SIMDType xmm5( a3 * b1 );
5933 SIMDType xmm6( a3 * b2 );
5934
5935 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5936 a1 = A.load(i ,k);
5937 a2 = A.load(i+1UL,k);
5938 a3 = A.load(i+2UL,k);
5939 b1 = B.load(k,j );
5940 b2 = B.load(k,j+1UL);
5941 xmm1 += a1 * b1;
5942 xmm2 += a1 * b2;
5943 xmm3 += a2 * b1;
5944 xmm4 += a2 * b2;
5945 xmm5 += a3 * b1;
5946 xmm6 += a3 * b2;
5947 }
5948
5949 C(i ,j ) -= sum( xmm1 );
5950 C(i ,j+1UL) -= sum( xmm2 );
5951 C(i+1UL,j ) -= sum( xmm3 );
5952 C(i+1UL,j+1UL) -= sum( xmm4 );
5953 C(i+2UL,j ) -= sum( xmm5 );
5954 C(i+2UL,j+1UL) -= sum( xmm6 );
5955
5956 for( ; remainder && k<kend; ++k ) {
5957 C(i ,j ) -= A(i ,k) * B(k,j );
5958 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5959 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5960 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5961 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5962 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5963 }
5964 }
5965 else
5966 {
5967 for( ; k<kend; ++k ) {
5968 C(i ,j ) -= A(i ,k) * B(k,j );
5969 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5970 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5971 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5972 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5973 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5974 }
5975 }
5976 }
5977
     // Remaining single column of the three-row tile (3x1).
5978 if( j < N )
5979 {
5980 const size_t kbegin( ( IsUpper_v<MT4> )
5981 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
5982 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
5983 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
5984
5985 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
5986 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
5987
5988 size_t k( kbegin );
5989
5990 if( k < kpos )
5991 {
5992 SIMDType b1( B.load(k,j) );
5993 SIMDType xmm1( A.load(i ,k) * b1 );
5994 SIMDType xmm2( A.load(i+1UL,k) * b1 );
5995 SIMDType xmm3( A.load(i+2UL,k) * b1 );
5996
5997 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
5998 b1 = B.load(k,j);
5999 xmm1 += A.load(i ,k) * b1;
6000 xmm2 += A.load(i+1UL,k) * b1;
6001 xmm3 += A.load(i+2UL,k) * b1;
6002 }
6003
6004 C(i ,j) -= sum( xmm1 );
6005 C(i+1UL,j) -= sum( xmm2 );
6006 C(i+2UL,j) -= sum( xmm3 );
6007
6008 for( ; remainder && k<kend; ++k ) {
6009 C(i ,j) -= A(i ,k) * B(k,j);
6010 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
6011 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
6012 }
6013 }
6014 else
6015 {
6016 for( ; k<kend; ++k ) {
6017 C(i ,j) -= A(i ,k) * B(k,j);
6018 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
6019 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
6020 }
6021 }
6022 }
6023 }
6024
     // Tiles of two rows. These are also used for LOW/UPP results: for LOW the column range
     // ends at i+2, for UPP it starts at column i.
6025 for( ; (i+2UL) <= M; i+=2UL )
6026 {
6027 const size_t jend( LOW ? i+2UL : N );
6028 size_t j( UPP ? i : 0UL );
6029
     // 2x2 tiles.
6030 for( ; (j+2UL) <= jend; j+=2UL )
6031 {
6032 const size_t kbegin( ( IsUpper_v<MT4> )
6033 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
6034 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
6035 const size_t kend( ( IsLower_v<MT4> )
6036 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
6037 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6038
6039 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
6040 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
6041
6042 size_t k( kbegin );
6043
6044 if( k < kpos )
6045 {
6046 SIMDType a1( A.load(i ,k) );
6047 SIMDType a2( A.load(i+1UL,k) );
6048 SIMDType b1( B.load(k,j ) );
6049 SIMDType b2( B.load(k,j+1UL) );
6050 SIMDType xmm1( a1 * b1 );
6051 SIMDType xmm2( a1 * b2 );
6052 SIMDType xmm3( a2 * b1 );
6053 SIMDType xmm4( a2 * b2 );
6054
6055 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
6056 a1 = A.load(i ,k);
6057 a2 = A.load(i+1UL,k);
6058 b1 = B.load(k,j );
6059 b2 = B.load(k,j+1UL);
6060 xmm1 += a1 * b1;
6061 xmm2 += a1 * b2;
6062 xmm3 += a2 * b1;
6063 xmm4 += a2 * b2;
6064 }
6065
6066 C(i ,j ) -= sum( xmm1 );
6067 C(i ,j+1UL) -= sum( xmm2 );
6068 C(i+1UL,j ) -= sum( xmm3 );
6069 C(i+1UL,j+1UL) -= sum( xmm4 );
6070
6071 for( ; remainder && k<kend; ++k ) {
6072 C(i ,j ) -= A(i ,k) * B(k,j );
6073 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
6074 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
6075 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
6076 }
6077 }
6078 else
6079 {
6080 for( ; k<kend; ++k ) {
6081 C(i ,j ) -= A(i ,k) * B(k,j );
6082 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
6083 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
6084 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
6085 }
6086 }
6087 }
6088
     // Remaining single column of the two-row tile (2x1).
6089 if( j < jend )
6090 {
6091 const size_t kbegin( ( IsUpper_v<MT4> )
6092 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
6093 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
6094 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6095
6096 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
6097 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
6098
6099 size_t k( kbegin );
6100
6101 if( k < kpos )
6102 {
6103 SIMDType b1( B.load(k,j) );
6104 SIMDType xmm1( A.load(i ,k) * b1 );
6105 SIMDType xmm2( A.load(i+1UL,k) * b1 );
6106
6107 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
6108 b1 = B.load(k,j);
6109 xmm1 += A.load(i ,k) * b1;
6110 xmm2 += A.load(i+1UL,k) * b1;
6111 }
6112
6113 C(i ,j) -= sum( xmm1 );
6114 C(i+1UL,j) -= sum( xmm2 );
6115
6116 for( ; remainder && k<kend; ++k ) {
6117 C(i ,j) -= A(i ,k) * B(k,j);
6118 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
6119 }
6120 }
6121 else
6122 {
6123 for( ; k<kend; ++k ) {
6124 C(i ,j) -= A(i ,k) * B(k,j);
6125 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
6126 }
6127 }
6128 }
6129 }
6130
     // Final single row (at most one row can be left after the two-row tiles).
6131 if( i < M )
6132 {
6133 const size_t jend( LOW ? i+1UL : N );
6134 size_t j( UPP ? i : 0UL );
6135
     // 1x2 tiles; only the structure of B narrows the end of the k-range here.
6136 for( ; (j+2UL) <= jend; j+=2UL )
6137 {
6138 const size_t kbegin( ( IsUpper_v<MT4> )
6139 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
6140 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
6141 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6142
6143 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
6144 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
6145
6146 size_t k( kbegin );
6147
6148 if( k < kpos )
6149 {
6150 SIMDType a1( A.load(i,k) );
6151 SIMDType xmm1( a1 * B.load(k,j ) );
6152 SIMDType xmm2( a1 * B.load(k,j+1UL) );
6153
6154 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
6155 a1 = A.load(i,k);
6156 xmm1 += a1 * B.load(k,j );
6157 xmm2 += a1 * B.load(k,j+1UL);
6158 }
6159
6160 C(i,j ) -= sum( xmm1 );
6161 C(i,j+1UL) -= sum( xmm2 );
6162
6163 for( ; remainder && k<kend; ++k ) {
6164 C(i,j ) -= A(i,k) * B(k,j );
6165 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
6166 }
6167 }
6168 else
6169 {
6170 for( ; k<kend; ++k ) {
6171 C(i,j ) -= A(i,k) * B(k,j );
6172 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
6173 }
6174 }
6175 }
6176
     // Last single element (1x1); the k-range always extends up to K.
6177 if( j < jend )
6178 {
6179 const size_t kbegin( ( IsUpper_v<MT4> )
6180 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
6181 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
6182
6183 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
6184 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
6185
6186 size_t k( kbegin );
6187
6188 if( k < kpos )
6189 {
6190 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
6191
6192 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
6193 xmm1 += A.load(i,k) * B.load(k,j);
6194 }
6195
6196 C(i,j) -= sum( xmm1 );
6197
6198 for( ; remainder && k<K; ++k ) {
6199 C(i,j) -= A(i,k) * B(k,j);
6200 }
6201 }
6202 else
6203 {
6204 for( ; k<K; ++k ) {
6205 C(i,j) -= A(i,k) * B(k,j);
6206 }
6207 }
6208 }
6209 }
6210 }
6212 //**********************************************************************************************
6213
6214 //**Default subtraction assignment to dense matrices (large matrices)***************************
6228 template< typename MT3 // Type of the left-hand side target matrix
6229 , typename MT4 // Type of the left-hand side matrix operand
6230 , typename MT5 > // Type of the right-hand side matrix operand
6231 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6232 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6233 {
6234 selectDefaultSubAssignKernel( C, A, B );
6235 }
6237 //**********************************************************************************************
6238
6239 //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
6254 template< typename MT3 // Type of the left-hand side target matrix
6255 , typename MT4 // Type of the left-hand side matrix operand
6256 , typename MT5 > // Type of the right-hand side matrix operand
6257 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6258 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6259 {
6260 if( LOW )
6261 lmmm( C, A, B, ElementType(-1), ElementType(1) );
6262 else if( UPP )
6263 ummm( C, A, B, ElementType(-1), ElementType(1) );
6264 else
6265 mmm( C, A, B, ElementType(-1), ElementType(1) );
6266 }
6268 //**********************************************************************************************
6269
6270 //**BLAS-based subtraction assignment to dense matrices (default)*******************************
6284 template< typename MT3 // Type of the left-hand side target matrix
6285 , typename MT4 // Type of the left-hand side matrix operand
6286 , typename MT5 > // Type of the right-hand side matrix operand
6287 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6288 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6289 {
6290 selectLargeSubAssignKernel( C, A, B );
6291 }
6293 //**********************************************************************************************
6294
6295 //**BLAS-based subtraction assignment to dense matrices*****************************************
6296#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6310 template< typename MT3 // Type of the left-hand side target matrix
6311 , typename MT4 // Type of the left-hand side matrix operand
6312 , typename MT5 > // Type of the right-hand side matrix operand
6313 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6314 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6315 {
6316 using ET = ElementType_t<MT3>;
6317
6318 if( IsTriangular_v<MT4> ) {
6319 ResultType_t<MT3> tmp( serial( B ) );
6320 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
6321 subAssign( C, tmp );
6322 }
6323 else if( IsTriangular_v<MT5> ) {
6324 ResultType_t<MT3> tmp( serial( A ) );
6325 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
6326 subAssign( C, tmp );
6327 }
6328 else {
6329 gemm( C, A, B, ET(-1), ET(1) );
6330 }
6331 }
6333#endif
6334 //**********************************************************************************************
6335
6336 //**Subtraction assignment to sparse matrices***************************************************
6337 // No special implementation for the subtraction assignment to sparse matrices.
6338 //**********************************************************************************************
6339
6340 //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression for the Schur product.
//
// The expression is first evaluated serially into a temporary of type ResultType; the temporary
// is then passed on to schurAssign() together with the target matrix.
6353 template< typename MT // Type of the target dense matrix
6354 , bool SO > // Storage order of the target dense matrix
6355 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
6356 {
6358
6362
     // The dimensions of the target and the expression have to match.
6363 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6364 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6365
     // Evaluate the complete product once (serially), then apply it element-wise.
6366 const ResultType tmp( serial( rhs ) );
6367 schurAssign( *lhs, tmp );
6368 }
6370 //**********************************************************************************************
6371
6372 //**Schur product assignment to sparse matrices*************************************************
6373 // No special implementation for the Schur product assignment to sparse matrices.
6374 //**********************************************************************************************
6375
6376 //**Multiplication assignment to dense matrices*************************************************
6377 // No special implementation for the multiplication assignment to dense matrices.
6378 //**********************************************************************************************
6379
6380 //**Multiplication assignment to sparse matrices************************************************
6381 // No special implementation for the multiplication assignment to sparse matrices.
6382 //**********************************************************************************************
6383
6384 //**SMP assignment to dense matrices************************************************************
// SMP assignment of the matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// This overload is enabled only if IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are
// evaluated into objects of type LT and RT (declared elsewhere in the class) and the product of
// the evaluated operands is forwarded to smpAssign().
6399 template< typename MT // Type of the target dense matrix
6400 , bool SO > // Storage order of the target dense matrix
6401 friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
6402 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6403 {
6405
6406 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6407 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6408
     // Empty target: nothing to assign. Empty inner dimension: the product has no terms, so the
     // target is reset.
6409 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
6410 return;
6411 }
6412 else if( rhs.lhs_.columns() == 0UL ) {
6413 reset( *lhs );
6414 return;
6415 }
6416
6417 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6418 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6419
     // The evaluated operands must agree in size with the original operands and the target.
6420 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6421 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6422 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6423 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6424 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
6425 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
6426
     // Assign the product of the evaluated operands.
6427 smpAssign( *lhs, A * B );
6428 }
6430 //**********************************************************************************************
6431
6432 //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the matrix product to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// This overload is enabled only if IsEvaluationRequired_v<MT,MT1,MT2> holds. The expression is
// evaluated into a temporary, which is passed through a ForwardFunctor object (type declared
// elsewhere in the class) before being handed to smpAssign().
6447 template< typename MT // Type of the target sparse matrix
6448 , bool SO > // Storage order of the target sparse matrix
6449 friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
6450 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6451 {
6453
     // The temporary is of type OppositeType if SO is true and of type ResultType otherwise.
6454 using TmpType = If_t< SO, OppositeType, ResultType >;
6455
6462
6463 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6464 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6465
6466 const ForwardFunctor fwd;
6467
     // Evaluate the product once, then assign the forwarded temporary.
6468 const TmpType tmp( rhs );
6469 smpAssign( *lhs, fwd( tmp ) );
6470 }
6472 //**********************************************************************************************
6473
6474 //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be added.
//
// This overload is enabled only if IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are
// evaluated into objects of type LT and RT (declared elsewhere in the class) and the product of
// the evaluated operands is forwarded to smpAddAssign().
6490 template< typename MT // Type of the target dense matrix
6491 , bool SO > // Storage order of the target dense matrix
6492 friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
6493 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6494 {
6496
6497 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6498 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6499
     // Nothing to add if the target is empty or the inner dimension of the product is zero.
6500 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6501 return;
6502 }
6503
6504 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6505 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6506
     // The evaluated operands must agree in size with the original operands and the target.
6507 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6508 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6509 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6510 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6511 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
6512 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
6513
     // Add the product of the evaluated operands.
6514 smpAddAssign( *lhs, A * B );
6515 }
6517 //**********************************************************************************************
6518
6519 //**SMP addition assignment to sparse matrices**************************************************
6520 // No special implementation for the SMP addition assignment to sparse matrices.
6521 //**********************************************************************************************
6522
6523 //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
//
// This overload is enabled only if IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are
// evaluated into objects of type LT and RT (declared elsewhere in the class) and the product of
// the evaluated operands is forwarded to smpSubAssign().
6539 template< typename MT // Type of the target dense matrix
6540 , bool SO > // Storage order of the target dense matrix
6541 friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
6542 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6543 {
6545
6546 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6547 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6548
     // Nothing to subtract if the target is empty or the inner dimension of the product is zero.
6549 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6550 return;
6551 }
6552
6553 LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6554 RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6555
     // The evaluated operands must agree in size with the original operands and the target.
6556 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6557 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6558 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6559 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6560 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
6561 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns() , "Invalid number of columns" );
6562
     // Subtract the product of the evaluated operands.
6563 smpSubAssign( *lhs, A * B );
6564 }
6566 //**********************************************************************************************
6567
6568 //**SMP subtraction assignment to sparse matrices***********************************************
6569 // No special implementation for the SMP subtraction assignment to sparse matrices.
6570 //**********************************************************************************************
6571
6572 //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression for the Schur product.
//
// The expression is first evaluated into a temporary of type ResultType; the temporary is then
// passed on to smpSchurAssign() together with the target matrix.
6585 template< typename MT // Type of the target dense matrix
6586 , bool SO > // Storage order of the target dense matrix
6587 friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
6588 {
6590
6594
     // The dimensions of the target and the expression have to match.
6595 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6596 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6597
     // Evaluate the complete product once, then apply it element-wise.
6598 const ResultType tmp( rhs );
6599 smpSchurAssign( *lhs, tmp );
6600 }
6602 //**********************************************************************************************
6603
6604 //**SMP Schur product assignment to sparse matrices*********************************************
6605 // No special implementation for the SMP Schur product assignment to sparse matrices.
6606 //**********************************************************************************************
6607
6608 //**SMP multiplication assignment to dense matrices*********************************************
6609 // No special implementation for the SMP multiplication assignment to dense matrices.
6610 //**********************************************************************************************
6611
6612 //**SMP multiplication assignment to sparse matrices********************************************
6613 // No special implementation for the SMP multiplication assignment to sparse matrices.
6614 //**********************************************************************************************
6615
6616 //**Compile time checks*************************************************************************
6624 //**********************************************************************************************
6625};
6626//*************************************************************************************************
6627
6628
6629
6630
6631//=================================================================================================
6632//
6633// DMATSCALARMULTEXPR SPECIALIZATION
6634//
6635//=================================================================================================
6636
6637//*************************************************************************************************
6645template< typename MT1 // Type of the left-hand side dense matrix
6646 , typename MT2 // Type of the right-hand side dense matrix
6647 , bool SF // Symmetry flag
6648 , bool HF // Hermitian flag
6649 , bool LF // Lower flag
6650 , bool UF // Upper flag
6651 , typename ST > // Type of the right-hand side scalar value
6652class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
6653 : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
6654 , private Computation
6655{
6656 private:
6657 //**Type definitions****************************************************************************
6659 using MMM = DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
6660
6661 using RES = ResultType_t<MMM>;
6662 using RT1 = ResultType_t<MT1>;
6663 using RT2 = ResultType_t<MT2>;
6664 using ET1 = ElementType_t<RT1>;
6665 using ET2 = ElementType_t<RT2>;
6666 using CT1 = CompositeType_t<MT1>;
6667 using CT2 = CompositeType_t<MT2>;
6668 //**********************************************************************************************
6669
6670 //**********************************************************************************************
6672 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
6673 //**********************************************************************************************
6674
6675 //**********************************************************************************************
6677 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
6678 //**********************************************************************************************
6679
6680 //**********************************************************************************************
6681 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
6682 static constexpr bool HERM = ( HF && !( LF || UF ) );
6683 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
6684 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
6685 //**********************************************************************************************
6686
6687 //**********************************************************************************************
6689
6692 template< typename T1, typename T2, typename T3 >
6693 static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
6694 //**********************************************************************************************
6695
6696 //**********************************************************************************************
6698
6700 template< typename T1, typename T2, typename T3, typename T4 >
6701 static constexpr bool UseBlasKernel_v =
6702 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
6703 !SYM && !HERM && !LOW && !UPP &&
6704 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
6705 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
6706 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
6707 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
6708 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6709 IsBLASCompatible_v< ElementType_t<T1> > &&
6710 IsBLASCompatible_v< ElementType_t<T2> > &&
6711 IsBLASCompatible_v< ElementType_t<T3> > &&
6712 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
6713 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
6714 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
6715 //**********************************************************************************************
6716
6717 //**********************************************************************************************
6719
// Compile-time flag used by the selectSmallAssignKernel() overloads to choose between the
// scalar default kernel and the vectorized default kernel. It is instantiated there as
// <target matrix, left operand, right operand, scalar type>.
6721 template< typename T1, typename T2, typename T3, typename T4 >
6722 static constexpr bool UseVectorizedDefaultKernel_v =
// useOptimizedKernels is declared outside this excerpt.
6723 ( useOptimizedKernels &&
// Diagonal operands are excluded; all three matrix types must be SIMD enabled.
6724 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
6725 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// The three element types and T4 must be SIMD combinable.
6726 IsSIMDCombinable_v< ElementType_t<T1>
6727 , ElementType_t<T2>
6728 , ElementType_t<T3>
6729 , T4 > &&
// SIMD addition and multiplication must exist for the operand element types.
6730 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
6731 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
6732 //**********************************************************************************************
6733
6734 //**********************************************************************************************
6736
// Functor type selected from the structure flags, tested in this order:
//   HERM -> DeclHerm, SYM -> DeclSym, LOW && UPP -> DeclDiag, LOW -> DeclLow,
//   UPP -> DeclUpp, otherwise Noop.
// The uses of this alias are outside this excerpt.
6738 using ForwardFunctor = If_t< HERM
6739 , DeclHerm
6740 , If_t< SYM
6741 , DeclSym
6742 , If_t< LOW
6743 , If_t< UPP
6744 , DeclDiag
6745 , DeclLow >
6746 , If_t< UPP
6747 , DeclUpp
6748 , Noop > > > >;
6749 //**********************************************************************************************
6750
6751 public:
6752 //**Type definitions****************************************************************************
// Type of this DMatScalarMultExpr instance (MMM, the wrapped product type, is declared outside
// this excerpt).
6754 using This = DMatScalarMultExpr<MMM,ST,false>;
6755
// Base type of this DMatScalarMultExpr instance.
6757 using BaseType = MatScalarMultExpr< DenseMatrix<This,false> >;
6758
// Result type for expression template evaluations. The trait wrapped around
// MultTrait_t<RES,ST> follows the same HERM / SYM / LOW / UPP cascade as ForwardFunctor; the
// unstructured case uses MultTrait<RES,ST> directly and all branches are resolved via ::Type.
6760 using ResultType = typename If_t< HERM
6761 , DeclHermTrait< MultTrait_t<RES,ST> >
6762 , If_t< SYM
6763 , DeclSymTrait< MultTrait_t<RES,ST> >
6764 , If_t< LOW
6765 , If_t< UPP
6766 , DeclDiagTrait< MultTrait_t<RES,ST> >
6767 , DeclLowTrait< MultTrait_t<RES,ST> > >
6768 , If_t< UPP
6769 , DeclUppTrait< MultTrait_t<RES,ST> >
6770 , MultTrait<RES,ST> > > > >::Type;
6771
// Types derived from ResultType: opposite storage order, transpose, element and SIMD type.
6772 using OppositeType = OppositeType_t<ResultType>;
6773 using TransposeType = TransposeType_t<ResultType>;
6774 using ElementType = ElementType_t<ResultType>;
6775 using SIMDType = SIMDTrait_t<ElementType>;
// Element access returns elements by const value; CompositeType is a const ResultType.
6776 using ReturnType = const ElementType;
6777 using CompositeType = const ResultType;
6778
// Type of the left-hand side operand: the wrapped dense matrix / transpose dense matrix product.
6780 using LeftOperand = const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
6781
// Type of the right-hand side operand: the scalar.
6783 using RightOperand = ST;
6784
// Type used for the left operand of the product inside the kernels: const RT1 if evaluateLeft
// is set, otherwise CT1 (RT1 / CT1 are declared outside this excerpt).
6786 using LT = If_t< evaluateLeft, const RT1, CT1 >;
6787
// Type used for the right operand of the product inside the kernels: const RT2 if evaluateRight
// is set, otherwise CT2.
6789 using RT = If_t< evaluateRight, const RT2, CT2 >;
6790 //**********************************************************************************************
6791
6792 //**Compilation flags***************************************************************************
// Compilation switch for SIMD evaluation: requires two non-diagonal, SIMD-enabled operand
// types whose element types (ET1, ET2) are SIMD combinable with the scalar type ST and support
// SIMD addition and multiplication.
6794 static constexpr bool simdEnabled =
6795 ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
6796 MT1::simdEnabled && MT2::simdEnabled &&
6797 IsSIMDCombinable_v<ET1,ET2,ST> &&
6798 HasSIMDAdd_v<ET1,ET2> &&
6799 HasSIMDMult_v<ET1,ET2> );
6800
// Compilation switch for SMP assignment: set only if neither operand needs an intermediate
// evaluation and both operand types are themselves SMP assignable.
6802 static constexpr bool smpAssignable =
6803 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
6804 //**********************************************************************************************
6805
6806 //**SIMD properties*****************************************************************************
// Number of elements in a SIMD vector of ElementType, as reported by SIMDTrait<ElementType>::size.
6808 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
6809 //**********************************************************************************************
6810
6811 //**Constructor*********************************************************************************
// Constructor: stores the matrix product expression and the scalar it is scaled by.
//   matrix : the wrapped dense matrix / transpose dense matrix product (type MMM)
//   scalar : the scalar factor
6817 inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
6818 : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
6819 , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
6820 {}
6821 //**********************************************************************************************
6822
6823 //**Access operator*****************************************************************************
// Unchecked 2D element access: returns element (i,j) of the wrapped product times the scalar.
//   i : row index, must be less than matrix_.rows()
//   j : column index, must be less than matrix_.columns()
// The index ranges are verified only via BLAZE_INTERNAL_ASSERT; use at() for checked access.
6830 inline ReturnType operator()( size_t i, size_t j ) const {
6831 BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
6832 BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
6833 return matrix_(i,j) * scalar_;
6834 }
6835 //**********************************************************************************************
6836
6837 //**At function*********************************************************************************
6845 inline ReturnType at( size_t i, size_t j ) const {
6846 if( i >= matrix_.rows() ) {
6847 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
6848 }
6849 if( j >= matrix_.columns() ) {
6850 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
6851 }
6852 return (*this)(i,j);
6853 }
6854 //**********************************************************************************************
6855
6856 //**Rows function*******************************************************************************
// Returns the number of rows of the expression, which is the row count of the wrapped product.
6861 inline size_t rows() const {
6862 return matrix_.rows();
6863 }
6864 //**********************************************************************************************
6865
6866 //**Columns function****************************************************************************
// Returns the number of columns of the expression, which is the column count of the wrapped
// product.
6871 inline size_t columns() const {
6872 return matrix_.columns();
6873 }
6874 //**********************************************************************************************
6875
6876 //**Left operand access*************************************************************************
// Returns the left-hand side operand of the scaling, i.e. the wrapped matrix product expression.
6881 inline LeftOperand leftOperand() const {
6882 return matrix_;
6883 }
6884 //**********************************************************************************************
6885
6886 //**Right operand access************************************************************************
// Returns the right-hand side operand of the scaling, i.e. the scalar factor.
6891 inline RightOperand rightOperand() const {
6892 return scalar_;
6893 }
6894 //**********************************************************************************************
6895
6896 //**********************************************************************************************
// Returns whether the expression can alias with the given address.
//   alias : the address to be checked
// The query is forwarded unchanged to the wrapped matrix product; the scalar is not consulted.
6902 template< typename T >
6903 inline bool canAlias( const T* alias ) const {
6904 return matrix_.canAlias( alias );
6905 }
6906 //**********************************************************************************************
6907
6908 //**********************************************************************************************
// Returns whether the expression is aliased with the given address.
//   alias : the address to be checked
// The query is forwarded unchanged to the wrapped matrix product; the scalar is not consulted.
6914 template< typename T >
6915 inline bool isAliased( const T* alias ) const {
6916 return matrix_.isAliased( alias );
6917 }
6918 //**********************************************************************************************
6919
6920 //**********************************************************************************************
// Returns whether the operands of the expression are properly aligned in memory, as reported by
// the wrapped matrix product.
6925 inline bool isAligned() const {
6926 return matrix_.isAligned();
6927 }
6928 //**********************************************************************************************
6929
6930 //**********************************************************************************************
6935 inline bool canSMPAssign() const noexcept {
6936 return ( !BLAZE_BLAS_MODE ||
6937 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
6939 ( rows() * columns() < DMATTDMATMULT_THRESHOLD ) ) &&
6940 ( rows() * columns() >= SMP_DMATTDMATMULT_THRESHOLD );
6941 }
6942 //**********************************************************************************************
6943
6944 private:
6945 //**Member variables****************************************************************************
6948 //**********************************************************************************************
6949
6950 //**Assignment to dense matrices****************************************************************
6962 template< typename MT // Type of the target dense matrix
6963 , bool SO > // Storage order of the target dense matrix
6964 friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6965 {
6967
6968 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
6969 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
6970
6971 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6972 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6973
6974 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
6975 return;
6976 }
6977 else if( left.columns() == 0UL ) {
6978 reset( *lhs );
6979 return;
6980 }
6981
6982 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6983 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6984
6985 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6986 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6987 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6988 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6989 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
6990 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
6991
6992 DMatScalarMultExpr::selectAssignKernel( *lhs, A, B, rhs.scalar_ );
6993 }
6994 //**********************************************************************************************
6995
6996 //**Assignment to dense matrices (kernel selection)*********************************************
7007 template< typename MT3 // Type of the left-hand side target matrix
7008 , typename MT4 // Type of the left-hand side matrix operand
7009 , typename MT5 // Type of the right-hand side matrix operand
7010 , typename ST2 > // Type of the scalar value
7011 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7012 {
7013 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
7014 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
7015 selectSmallAssignKernel( C, A, B, scalar );
7016 else
7017 selectBlasAssignKernel( C, A, B, scalar );
7018 }
7019 //**********************************************************************************************
7020
7021 //**Default assignment to row-major dense matrices (general/general)****************************
// Default (element-wise, non-vectorized) kernel for C = s*A*B with a row-major target matrix
// and two non-diagonal operands (see the EnableIf_t condition).
//   C      : the target matrix
//   A, B   : the left-hand and right-hand side operands of the product
//   scalar : the scaling factor
// All index ranges are narrowed at compile time according to the triangular structure of A and
// B and to the SYM/HERM/LOW/UPP flags; elements outside the computed ranges are reset.
7035 template< typename MT3 // Type of the left-hand side target matrix
7036 , typename MT4 // Type of the left-hand side matrix operand
7037 , typename MT5 // Type of the right-hand side matrix operand
7038 , typename ST2 > // Type of the scalar value
7039 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7040 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7041 {
// M x K times K x N product
7042 const size_t M( A.rows() );
7043 const size_t N( B.columns() );
7044 const size_t K( A.columns() );
7045
// Any structure flag implies a square result
7046 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7047
// Rows in [ibegin,iend) are computed. Leading rows are skipped for a strictly lower A (two if
// B is strictly lower as well), trailing rows for a strictly upper A.
7048 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
7049 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
7050 :( 0UL ) );
7051 const size_t iend( ( IsStrictlyUpper_v<MT4> )
7052 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
7053 :( M ) );
7054 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7055
// Skipped leading rows are reset completely
7056 for( size_t i=0UL; i<ibegin; ++i ) {
7057 for( size_t j=0UL; j<N; ++j ) {
7058 reset( C(i,j) );
7059 }
7060 }
7061 for( size_t i=ibegin; i<iend; ++i )
7062 {
// Columns in [jbegin,jend) of row i are computed. With SYM, HERM or UPP the range starts at
// the diagonal at the earliest; with LOW it ends right after the diagonal at the latest.
7063 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7064 ?( ( IsStrictlyUpper_v<MT4> )
7065 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
7066 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
7067 :( ( IsStrictlyUpper_v<MT5> )
7068 ?( SYM || HERM || UPP ? max( i, 1UL ) : 1UL )
7069 :( SYM || HERM || UPP ? i : 0UL ) ) );
7070 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
7071 ?( ( IsStrictlyLower_v<MT4> )
7072 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
7073 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
7074 :( ( IsStrictlyLower_v<MT5> )
7075 ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
7076 :( LOW ? i+1UL : N ) ) );
7077
// With a structure flag the column range can be empty (jbegin > jend); the row is then reset
7078 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
7079 for( size_t j=0UL; j<N; ++j ) {
7080 reset( C(i,j) );
7081 }
7082 continue;
7083 }
7084
7085 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7086
// Reset the elements in front of the computed range. For SYM/HERM this starts at the diagonal
// because the part left of the diagonal is filled by the mirroring step at the end.
7087 for( size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
7088 reset( C(i,j) );
7089 }
7090 for( size_t j=jbegin; j<jend; ++j )
7091 {
// Summation range [kbegin,kend) of the dot product of row i of A and column j of B, narrowed
// by the triangular structure of the two operands
7092 const size_t kbegin( ( IsUpper_v<MT4> )
7093 ?( ( IsLower_v<MT5> )
7094 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7095 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7096 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7097 :( ( IsLower_v<MT5> )
7098 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7099 :( 0UL ) ) );
7100 const size_t kend( ( IsLower_v<MT4> )
7101 ?( ( IsUpper_v<MT5> )
7102 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
7103 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7104 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7105 :( ( IsUpper_v<MT5> )
7106 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7107 :( K ) ) );
7108 BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
7109
// The first product initializes the element, the remaining ones are accumulated, and the
// scalar is applied once to the finished sum
7110 C(i,j) = A(i,kbegin) * B(kbegin,j);
7111 for( size_t k=kbegin+1UL; k<kend; ++k ) {
7112 C(i,j) += A(i,k) * B(k,j);
7113 }
7114 C(i,j) *= scalar;
7115 }
// Reset the elements behind the computed range
7116 for( size_t j=jend; j<N; ++j ) {
7117 reset( C(i,j) );
7118 }
7119 }
// Skipped trailing rows are reset completely
7120 for( size_t i=iend; i<M; ++i ) {
7121 for( size_t j=0UL; j<N; ++j ) {
7122 reset( C(i,j) );
7123 }
7124 }
7125
// SYM/HERM: fill the part below the diagonal from the computed part above it (conjugated for
// HERM)
7126 if( SYM || HERM ) {
7127 for( size_t i=1UL; i<M; ++i ) {
7128 for( size_t j=0UL; j<i; ++j ) {
7129 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
7130 }
7131 }
7132 }
7133 }
7134 //**********************************************************************************************
7135
7136 //**Default assignment to column-major dense matrices (general/general)*************************
// Default (element-wise, non-vectorized) kernel for C = s*A*B with a column-major target matrix
// and two non-diagonal operands (see the EnableIf_t condition).
//   C      : the target matrix
//   A, B   : the left-hand and right-hand side operands of the product
//   scalar : the scaling factor
// This is the column-wise counterpart of the row-major kernel: the outer loop runs over the
// columns of C, and all index ranges are narrowed according to the triangular structure of A
// and B and to the SYM/HERM/LOW/UPP flags. Elements outside the computed ranges are reset.
7150 template< typename MT3 // Type of the left-hand side target matrix
7151 , typename MT4 // Type of the left-hand side matrix operand
7152 , typename MT5 // Type of the right-hand side matrix operand
7153 , typename ST2 > // Type of the scalar value
7154 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7155 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7156 {
// M x K times K x N product
7157 const size_t M( A.rows() );
7158 const size_t N( B.columns() );
7159 const size_t K( A.columns() );
7160
// Any structure flag implies a square result
7161 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7162
// Columns in [jbegin,jend) are computed. Leading columns are skipped for a strictly upper B
// (two if A is strictly upper as well), trailing columns for a strictly lower B.
7163 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
7164 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
7165 :( 0UL ) );
7166 const size_t jend( ( IsStrictlyLower_v<MT5> )
7167 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
7168 :( N ) );
7169 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7170
// Skipped leading columns are reset completely
7171 for( size_t j=0UL; j<jbegin; ++j ) {
7172 for( size_t i=0UL; i<M; ++i ) {
7173 reset( C(i,j) );
7174 }
7175 }
7176 for( size_t j=jbegin; j<jend; ++j )
7177 {
// Rows in [ibegin,iend) of column j are computed. With SYM, HERM or LOW the range starts at
// the diagonal at the earliest; with UPP it ends right after the diagonal at the latest.
7178 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
7179 ?( ( IsStrictlyLower_v<MT4> )
7180 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
7181 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7182 :( ( IsStrictlyLower_v<MT4> )
7183 ?( SYM || HERM || LOW ? max( j, 1UL ) : 1UL )
7184 :( SYM || HERM || LOW ? j : 0UL ) ) );
7185 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7186 ?( ( IsStrictlyUpper_v<MT4> )
7187 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
7188 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
7189 :( ( IsStrictlyUpper_v<MT4> )
7190 ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
7191 :( UPP ? j+1UL : M ) ) );
7192
// With a structure flag the row range can be empty (ibegin > iend); the column is then reset
7193 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
7194 for( size_t i=0UL; i<M; ++i ) {
7195 reset( C(i,j) );
7196 }
7197 continue;
7198 }
7199
7200 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7201
// Reset the elements in front of the computed range. For SYM/HERM this starts at the diagonal
// because the part above the diagonal is filled by the mirroring step at the end.
7202 for( size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
7203 reset( C(i,j) );
7204 }
7205 for( size_t i=ibegin; i<iend; ++i )
7206 {
// Summation range [kbegin,kend) of the dot product of row i of A and column j of B, narrowed
// by the triangular structure of the two operands
7207 const size_t kbegin( ( IsUpper_v<MT4> )
7208 ?( ( IsLower_v<MT5> )
7209 ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7210 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7211 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7212 :( ( IsLower_v<MT5> )
7213 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7214 :( 0UL ) ) );
7215 const size_t kend( ( IsLower_v<MT4> )
7216 ?( ( IsUpper_v<MT5> )
7217 ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
7218 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7219 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7220 :( ( IsUpper_v<MT5> )
7221 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7222 :( K ) ) );
7223 BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
7224
// The first product initializes the element, the remaining ones are accumulated, and the
// scalar is applied once to the finished sum
7225 C(i,j) = A(i,kbegin) * B(kbegin,j);
7226 for( size_t k=kbegin+1UL; k<kend; ++k ) {
7227 C(i,j) += A(i,k) * B(k,j);
7228 }
7229 C(i,j) *= scalar;
7230 }
// Reset the elements behind the computed range
7231 for( size_t i=iend; i<M; ++i ) {
7232 reset( C(i,j) );
7233 }
7234 }
// Skipped trailing columns are reset completely
7235 for( size_t j=jend; j<N; ++j ) {
7236 for( size_t i=0UL; i<M; ++i ) {
7237 reset( C(i,j) );
7238 }
7239 }
7240
// SYM/HERM: fill the part above the diagonal from the computed part below it (conjugated for
// HERM)
7241 if( SYM || HERM ) {
7242 for( size_t j=1UL; j<N; ++j ) {
7243 for( size_t i=0UL; i<j; ++i ) {
7244 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
7245 }
7246 }
7247 }
7248 }
7249 //**********************************************************************************************
7250
7251 //**Default assignment to row-major dense matrices (general/diagonal)***************************
7265 template< typename MT3 // Type of the left-hand side target matrix
7266 , typename MT4 // Type of the left-hand side matrix operand
7267 , typename MT5 // Type of the right-hand side matrix operand
7268 , typename ST2 > // Type of the scalar value
7269 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7270 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7271 {
7272 const size_t M( A.rows() );
7273 const size_t N( B.columns() );
7274
7275 for( size_t i=0UL; i<M; ++i )
7276 {
7277 const size_t jbegin( ( IsUpper_v<MT4> )
7278 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7279 :( 0UL ) );
7280 const size_t jend( ( IsLower_v<MT4> )
7281 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
7282 :( N ) );
7283 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7284
7285 if( IsUpper_v<MT4> ) {
7286 for( size_t j=0UL; j<jbegin; ++j ) {
7287 reset( C(i,j) );
7288 }
7289 }
7290 for( size_t j=jbegin; j<jend; ++j ) {
7291 C(i,j) = A(i,j) * B(j,j) * scalar;
7292 }
7293 if( IsLower_v<MT4> ) {
7294 for( size_t j=jend; j<N; ++j ) {
7295 reset( C(i,j) );
7296 }
7297 }
7298 }
7299 }
7300 //**********************************************************************************************
7301
7302 //**Default assignment to column-major dense matrices (general/diagonal)************************
// Default kernel for C = s*A*B with a column-major target matrix, a non-diagonal left operand A
// and a diagonal right operand B (see the EnableIf_t condition).
//   C      : the target matrix
//   A, B   : the left-hand and right-hand side operands of the product
//   scalar : the scaling factor
// Since B is diagonal, every element reduces to A(i,j) * B(j,j) * scalar. The target is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements (BLOCK_SIZE is declared outside this
// excerpt).
7316 template< typename MT3 // Type of the left-hand side target matrix
7317 , typename MT4 // Type of the left-hand side matrix operand
7318 , typename MT5 // Type of the right-hand side matrix operand
7319 , typename ST2 > // Type of the scalar value
7320 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7321 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7322 {
7323 constexpr size_t block( BLOCK_SIZE );
7324
7325 const size_t M( A.rows() );
7326 const size_t N( B.columns() );
7327
// Outer loops: tiles of columns [jj,jend) and rows [ii,iend)
7328 for( size_t jj=0UL; jj<N; jj+=block ) {
7329 const size_t jend( min( N, jj+block ) );
7330 for( size_t ii=0UL; ii<M; ii+=block ) {
7331 const size_t iend( min( M, ii+block ) );
7332 for( size_t j=jj; j<jend; ++j )
7333 {
// Rows of column j inside this tile that can be non-zero according to the triangular structure
// of A, clamped to the tile: [ibegin,ipos)
7334 const size_t ibegin( ( IsLower_v<MT4> )
7335 ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
7336 :( ii ) );
7337 const size_t ipos( ( IsUpper_v<MT4> )
7338 ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
7339 :( iend ) );
7340
// Lower A: reset the rows of the tile in front of the computed range
7341 if( IsLower_v<MT4> ) {
7342 for( size_t i=ii; i<ibegin; ++i ) {
7343 reset( C(i,j) );
7344 }
7345 }
7346 for( size_t i=ibegin; i<ipos; ++i ) {
7347 C(i,j) = A(i,j) * B(j,j) * scalar;
7348 }
// Upper A: reset the rows of the tile behind the computed range
7349 if( IsUpper_v<MT4> ) {
7350 for( size_t i=ipos; i<iend; ++i ) {
7351 reset( C(i,j) );
7352 }
7353 }
7354 }
7355 }
7356 }
7357 }
7358 //**********************************************************************************************
7359
7360 //**Default assignment to row-major dense matrices (diagonal/general)***************************
// Default kernel for C = s*A*B with a row-major target matrix, a diagonal left operand A and a
// non-diagonal right operand B (see the EnableIf_t condition).
//   C      : the target matrix
//   A, B   : the left-hand and right-hand side operands of the product
//   scalar : the scaling factor
// Since A is diagonal, every element reduces to A(i,i) * B(i,j) * scalar. The target is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements (BLOCK_SIZE is declared outside this
// excerpt).
7374 template< typename MT3 // Type of the left-hand side target matrix
7375 , typename MT4 // Type of the left-hand side matrix operand
7376 , typename MT5 // Type of the right-hand side matrix operand
7377 , typename ST2 > // Type of the scalar value
7378 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7379 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7380 {
7381 constexpr size_t block( BLOCK_SIZE );
7382
7383 const size_t M( A.rows() );
7384 const size_t N( B.columns() );
7385
// Outer loops: tiles of rows [ii,iend) and columns [jj,jend)
7386 for( size_t ii=0UL; ii<M; ii+=block ) {
7387 const size_t iend( min( M, ii+block ) );
7388 for( size_t jj=0UL; jj<N; jj+=block ) {
7389 const size_t jend( min( N, jj+block ) );
7390 for( size_t i=ii; i<iend; ++i )
7391 {
// Columns of row i inside this tile that can be non-zero according to the triangular structure
// of B, clamped to the tile: [jbegin,jpos)
7392 const size_t jbegin( ( IsUpper_v<MT5> )
7393 ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
7394 :( jj ) );
7395 const size_t jpos( ( IsLower_v<MT5> )
7396 ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
7397 :( jend ) );
7398
// Upper B: reset the columns of the tile in front of the computed range
7399 if( IsUpper_v<MT5> ) {
7400 for( size_t j=jj; j<jbegin; ++j ) {
7401 reset( C(i,j) );
7402 }
7403 }
7404 for( size_t j=jbegin; j<jpos; ++j ) {
7405 C(i,j) = A(i,i) * B(i,j) * scalar;
7406 }
// Lower B: reset the columns of the tile behind the computed range
7407 if( IsLower_v<MT5> ) {
7408 for( size_t j=jpos; j<jend; ++j ) {
7409 reset( C(i,j) );
7410 }
7411 }
7412 }
7413 }
7414 }
7415 }
7416 //**********************************************************************************************
7417
7418 //**Default assignment to column-major dense matrices (diagonal/general)************************
7432 template< typename MT3 // Type of the left-hand side target matrix
7433 , typename MT4 // Type of the left-hand side matrix operand
7434 , typename MT5 // Type of the right-hand side matrix operand
7435 , typename ST2 > // Type of the scalar value
7436 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7437 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7438 {
7439 const size_t M( A.rows() );
7440 const size_t N( B.columns() );
7441
7442 for( size_t j=0UL; j<N; ++j )
7443 {
7444 const size_t ibegin( ( IsLower_v<MT5> )
7445 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7446 :( 0UL ) );
7447 const size_t iend( ( IsUpper_v<MT5> )
7448 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7449 :( M ) );
7450 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7451
7452 if( IsLower_v<MT5> ) {
7453 for( size_t i=0UL; i<ibegin; ++i ) {
7454 reset( C(i,j) );
7455 }
7456 }
7457 for( size_t i=ibegin; i<iend; ++i ) {
7458 C(i,j) = A(i,i) * B(i,j) * scalar;
7459 }
7460 if( IsUpper_v<MT5> ) {
7461 for( size_t i=iend; i<M; ++i ) {
7462 reset( C(i,j) );
7463 }
7464 }
7465 }
7466 }
7467 //**********************************************************************************************
7468
7469 //**Default assignment to dense matrices (diagonal/diagonal)************************************
7483 template< typename MT3 // Type of the left-hand side target matrix
7484 , typename MT4 // Type of the left-hand side matrix operand
7485 , typename MT5 // Type of the right-hand side matrix operand
7486 , typename ST2 > // Type of the scalar value
7487 static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7488 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7489 {
7490 reset( C );
7491
7492 for( size_t i=0UL; i<A.rows(); ++i ) {
7493 C(i,i) = A(i,i) * B(i,i) * scalar;
7494 }
7495 }
7496 //**********************************************************************************************
7497
7498 //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix kernel for C = s*A*B when the vectorized default kernel is not applicable.
//   C      : the target matrix
//   A, B   : the left-hand and right-hand side operands of the product
//   scalar : the scaling factor
// This overload takes part in overload resolution only if
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t) and simply forwards to
// the matching selectDefaultAssignKernel() overload.
7512 template< typename MT3 // Type of the left-hand side target matrix
7513 , typename MT4 // Type of the left-hand side matrix operand
7514 , typename MT5 // Type of the right-hand side matrix operand
7515 , typename ST2 > // Type of the scalar value
7516 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7517 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7518 {
7519 selectDefaultAssignKernel( C, A, B, scalar );
7520 }
7521 //**********************************************************************************************
7522
7523 //**Vectorized default assignment to row-major dense matrices (small matrices)******************
7538 template< typename MT3 // Type of the left-hand side target matrix
7539 , typename MT4 // Type of the left-hand side matrix operand
7540 , typename MT5 // Type of the right-hand side matrix operand
7541 , typename ST2 > // Type of the scalar value
7542 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7543 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7544 {
7545 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7546
7547 const size_t M( A.rows() );
7548 const size_t N( B.columns() );
7549 const size_t K( A.columns() );
7550
7551 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7552
7553 size_t i( 0UL );
7554
7555 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
7556 {
7557 const size_t jend( LOW ? i+3UL : N );
7558 size_t j( 0UL );
7559
7560 if( SYM || HERM ) {
7561 for( ; j<i; ++j ) {
7562 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
7563 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
7564 C(i+2UL,j) = HERM ? conj( C(j,i+2UL) ) : C(j,i+2UL);
7565 }
7566 }
7567 else if( UPP ) {
7568 for( ; j<i; ++j ) {
7569 reset( C(i ,j) );
7570 reset( C(i+1UL,j) );
7571 reset( C(i+2UL,j) );
7572 }
7573 }
7574
7575 for( ; (j+3UL) <= jend; j+=3UL )
7576 {
7577 const size_t kbegin( ( IsUpper_v<MT4> )
7578 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
7579 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
7580 const size_t kend( ( IsLower_v<MT4> )
7581 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
7582 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
7583
7584 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
7585 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
7586
7587 size_t k( kbegin );
7588
7589 if( k < kpos )
7590 {
7591 SIMDType a1( A.load(i ,k) );
7592 SIMDType a2( A.load(i+1UL,k) );
7593 SIMDType a3( A.load(i+2UL,k) );
7594 SIMDType b1( B.load(k,j ) );
7595 SIMDType b2( B.load(k,j+1UL) );
7596 SIMDType b3( B.load(k,j+2UL) );
7597 SIMDType xmm1( a1 * b1 );
7598 SIMDType xmm2( a1 * b2 );
7599 SIMDType xmm3( a1 * b3 );
7600 SIMDType xmm4( a2 * b1 );
7601 SIMDType xmm5( a2 * b2 );
7602 SIMDType xmm6( a2 * b3 );
7603 SIMDType xmm7( a3 * b1 );
7604 SIMDType xmm8( a3 * b2 );
7605 SIMDType xmm9( a3 * b3 );
7606
7607 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
7608 a1 = A.load(i ,k);
7609 a2 = A.load(i+1UL,k);
7610 a3 = A.load(i+2UL,k);
7611 b1 = B.load(k,j );
7612 b2 = B.load(k,j+1UL);
7613 b3 = B.load(k,j+2UL);
7614 xmm1 += a1 * b1;
7615 xmm2 += a1 * b2;
7616 xmm3 += a1 * b3;
7617 xmm4 += a2 * b1;
7618 xmm5 += a2 * b2;
7619 xmm6 += a2 * b3;
7620 xmm7 += a3 * b1;
7621 xmm8 += a3 * b2;
7622 xmm9 += a3 * b3;
7623 }
7624
7625 C(i ,j ) = sum( xmm1 ) * scalar;
7626 C(i ,j+1UL) = sum( xmm2 ) * scalar;
7627 C(i ,j+2UL) = sum( xmm3 ) * scalar;
7628 C(i+1UL,j ) = sum( xmm4 ) * scalar;
7629 C(i+1UL,j+1UL) = sum( xmm5 ) * scalar;
7630 C(i+1UL,j+2UL) = sum( xmm6 ) * scalar;
7631 C(i+2UL,j ) = sum( xmm7 ) * scalar;
7632 C(i+2UL,j+1UL) = sum( xmm8 ) * scalar;
7633 C(i+2UL,j+2UL) = sum( xmm9 ) * scalar;
7634
7635 for( ; remainder && k<kend; ++k ) {
7636 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
7637 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
7638 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
7639 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
7640 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
7641 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
7642 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
7643 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
7644 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL) * scalar;
7645 }
7646 }
7647 else if( k < kend )
7648 {
7649 ElementType value1( A(i ,k) * B(k,j ) );
7650 ElementType value2( A(i ,k) * B(k,j+1UL) );
7651 ElementType value3( A(i ,k) * B(k,j+2UL) );
7652 ElementType value4( A(i+1UL,k) * B(k,j ) );
7653 ElementType value5( A(i+1UL,k) * B(k,j+1UL) );
7654 ElementType value6( A(i+1UL,k) * B(k,j+2UL) );
7655 ElementType value7( A(i+2UL,k) * B(k,j ) );
7656 ElementType value8( A(i+2UL,k) * B(k,j+1UL) );
7657 ElementType value9( A(i+2UL,k) * B(k,j+2UL) );
7658
7659 for( ++k; k<kend; ++k ) {
7660 value1 += A(i ,k) * B(k,j );
7661 value2 += A(i ,k) * B(k,j+1UL);
7662 value3 += A(i ,k) * B(k,j+2UL);
7663 value4 += A(i+1UL,k) * B(k,j );
7664 value5 += A(i+1UL,k) * B(k,j+1UL);
7665 value6 += A(i+1UL,k) * B(k,j+2UL);
7666 value7 += A(i+2UL,k) * B(k,j );
7667 value8 += A(i+2UL,k) * B(k,j+1UL);
7668 value9 += A(i+2UL,k) * B(k,j+2UL);
7669 }
7670
7671 C(i ,j ) = value1 * scalar;
7672 C(i ,j+1UL) = value2 * scalar;
7673 C(i ,j+2UL) = value3 * scalar;
7674 C(i+1UL,j ) = value4 * scalar;
7675 C(i+1UL,j+1UL) = value5 * scalar;
7676 C(i+1UL,j+2UL) = value6 * scalar;
7677 C(i+2UL,j ) = value7 * scalar;
7678 C(i+2UL,j+1UL) = value8 * scalar;
7679 C(i+2UL,j+2UL) = value9 * scalar;
7680 }
7681 else
7682 {
7683 reset( C(i ,j ) );
7684 reset( C(i ,j+1UL) );
7685 reset( C(i ,j+2UL) );
7686 reset( C(i+1UL,j ) );
7687 reset( C(i+1UL,j+1UL) );
7688 reset( C(i+1UL,j+2UL) );
7689 reset( C(i+2UL,j ) );
7690 reset( C(i+2UL,j+1UL) );
7691 reset( C(i+2UL,j+2UL) );
7692 }
7693 }
7694
7695 for( ; (j+2UL) <= jend; j+=2UL )
7696 {
7697 const size_t kbegin( ( IsUpper_v<MT4> )
7698 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
7699 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
7700 const size_t kend( ( IsLower_v<MT4> )
7701 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
7702 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7703
7704 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
7705 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
7706
7707 size_t k( kbegin );
7708
7709 if( k < kpos )
7710 {
7711 SIMDType a1( A.load(i ,k) );
7712 SIMDType a2( A.load(i+1UL,k) );
7713 SIMDType a3( A.load(i+2UL,k) );
7714 SIMDType b1( B.load(k,j ) );
7715 SIMDType b2( B.load(k,j+1UL) );
7716 SIMDType xmm1( a1 * b1 );
7717 SIMDType xmm2( a1 * b2 );
7718 SIMDType xmm3( a2 * b1 );
7719 SIMDType xmm4( a2 * b2 );
7720 SIMDType xmm5( a3 * b1 );
7721 SIMDType xmm6( a3 * b2 );
7722
7723 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
7724 a1 = A.load(i ,k);
7725 a2 = A.load(i+1UL,k);
7726 a3 = A.load(i+2UL,k);
7727 b1 = B.load(k,j );
7728 b2 = B.load(k,j+1UL);
7729 xmm1 += a1 * b1;
7730 xmm2 += a1 * b2;
7731 xmm3 += a2 * b1;
7732 xmm4 += a2 * b2;
7733 xmm5 += a3 * b1;
7734 xmm6 += a3 * b2;
7735 }
7736
7737 C(i ,j ) = sum( xmm1 ) * scalar;
7738 C(i ,j+1UL) = sum( xmm2 ) * scalar;
7739 C(i+1UL,j ) = sum( xmm3 ) * scalar;
7740 C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
7741 C(i+2UL,j ) = sum( xmm5 ) * scalar;
7742 C(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
7743
7744 for( ; remainder && k<kend; ++k ) {
7745 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
7746 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
7747 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
7748 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
7749 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
7750 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
7751 }
7752 }
7753 else if( k < kend )
7754 {
7755 ElementType value1( A(i ,k) * B(k,j ) );
7756 ElementType value2( A(i ,k) * B(k,j+1UL) );
7757 ElementType value3( A(i+1UL,k) * B(k,j ) );
7758 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
7759 ElementType value5( A(i+2UL,k) * B(k,j ) );
7760 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
7761
7762 for( ++k; k<kend; ++k ) {
7763 value1 += A(i ,k) * B(k,j );
7764 value2 += A(i ,k) * B(k,j+1UL);
7765 value3 += A(i+1UL,k) * B(k,j );
7766 value4 += A(i+1UL,k) * B(k,j+1UL);
7767 value5 += A(i+2UL,k) * B(k,j );
7768 value6 += A(i+2UL,k) * B(k,j+1UL);
7769 }
7770
7771 C(i ,j ) = value1 * scalar;
7772 C(i ,j+1UL) = value2 * scalar;
7773 C(i+1UL,j ) = value3 * scalar;
7774 C(i+1UL,j+1UL) = value4 * scalar;
7775 C(i+2UL,j ) = value5 * scalar;
7776 C(i+2UL,j+1UL) = value6 * scalar;
7777 }
7778 else
7779 {
7780 reset( C(i ,j ) );
7781 reset( C(i ,j+1UL) );
7782 reset( C(i+1UL,j ) );
7783 reset( C(i+1UL,j+1UL) );
7784 reset( C(i+2UL,j ) );
7785 reset( C(i+2UL,j+1UL) );
7786 }
7787 }
7788
7789 if( j < jend )
7790 {
7791 const size_t kbegin( ( IsUpper_v<MT4> )
7792 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
7793 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
7794 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
7795
7796 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
7797 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
7798
7799 size_t k( kbegin );
7800
7801 if( k < kpos )
7802 {
7803 SIMDType b1( B.load(k,j) );
7804 SIMDType xmm1( A.load(i ,k) * b1 );
7805 SIMDType xmm2( A.load(i+1UL,k) * b1 );
7806 SIMDType xmm3( A.load(i+2UL,k) * b1 );
7807
7808 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
7809 b1 = B.load(k,j);
7810 xmm1 += A.load(i ,k) * b1;
7811 xmm2 += A.load(i+1UL,k) * b1;
7812 xmm3 += A.load(i+2UL,k) * b1;
7813 }
7814
7815 C(i ,j) = sum( xmm1 ) * scalar;
7816 C(i+1UL,j) = sum( xmm2 ) * scalar;
7817 C(i+2UL,j) = sum( xmm3 ) * scalar;
7818
7819 for( ; remainder && k<kend; ++k ) {
7820 C(i ,j) += A(i ,k) * B(k,j) * scalar;
7821 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
7822 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
7823 }
7824 }
7825 else if( k < kend )
7826 {
7827 ElementType value1( A(i ,k) * B(k,j) );
7828 ElementType value2( A(i+1UL,k) * B(k,j) );
7829 ElementType value3( A(i+2UL,k) * B(k,j) );
7830
7831 for( ++k; k<kend; ++k ) {
7832 value1 += A(i ,k) * B(k,j);
7833 value2 += A(i+1UL,k) * B(k,j);
7834 value3 += A(i+2UL,k) * B(k,j);
7835 }
7836
7837 C(i ,j) = value1 * scalar;
7838 C(i+1UL,j) = value2 * scalar;
7839 C(i+2UL,j) = value3 * scalar;
7840 }
7841 else
7842 {
7843 reset( C(i ,j) );
7844 reset( C(i+1UL,j) );
7845 reset( C(i+2UL,j) );
7846 }
7847
7848 if( LOW ) ++j;
7849 }
7850
7851 if( LOW ) {
7852 for( ; j<N; ++j ) {
7853 reset( C(i ,j) );
7854 reset( C(i+1UL,j) );
7855 reset( C(i+2UL,j) );
7856 }
7857 }
7858 }
7859
7860 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
7861 {
7862 const size_t jend( LOW ? i+2UL : N );
7863 size_t j( 0UL );
7864
7865 if( SYM || HERM ) {
7866 for( ; j<i; ++j ) {
7867 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
7868 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
7869 }
7870 }
7871 else if( UPP ) {
7872 for( ; j<i; ++j ) {
7873 reset( C(i ,j) );
7874 reset( C(i+1UL,j) );
7875 }
7876 }
7877
7878 for( ; (j+4UL) <= jend; j+=4UL )
7879 {
7880 const size_t kbegin( ( IsUpper_v<MT4> )
7881 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
7882 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
7883 const size_t kend( ( IsLower_v<MT4> )
7884 ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
7885 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
7886
7887 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
7888 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
7889
7890 size_t k( kbegin );
7891
7892 if( k < kpos )
7893 {
7894 SIMDType a1( A.load(i ,k) );
7895 SIMDType a2( A.load(i+1UL,k) );
7896 SIMDType b1( B.load(k,j ) );
7897 SIMDType b2( B.load(k,j+1UL) );
7898 SIMDType b3( B.load(k,j+2UL) );
7899 SIMDType b4( B.load(k,j+3UL) );
7900 SIMDType xmm1( a1 * b1 );
7901 SIMDType xmm2( a1 * b2 );
7902 SIMDType xmm3( a1 * b3 );
7903 SIMDType xmm4( a1 * b4 );
7904 SIMDType xmm5( a2 * b1 );
7905 SIMDType xmm6( a2 * b2 );
7906 SIMDType xmm7( a2 * b3 );
7907 SIMDType xmm8( a2 * b4 );
7908
7909 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
7910 a1 = A.load(i ,k);
7911 a2 = A.load(i+1UL,k);
7912 b1 = B.load(k,j );
7913 b2 = B.load(k,j+1UL);
7914 b3 = B.load(k,j+2UL);
7915 b4 = B.load(k,j+3UL);
7916 xmm1 += a1 * b1;
7917 xmm2 += a1 * b2;
7918 xmm3 += a1 * b3;
7919 xmm4 += a1 * b4;
7920 xmm5 += a2 * b1;
7921 xmm6 += a2 * b2;
7922 xmm7 += a2 * b3;
7923 xmm8 += a2 * b4;
7924 }
7925
7926 C(i ,j ) = sum( xmm1 ) * scalar;
7927 C(i ,j+1UL) = sum( xmm2 ) * scalar;
7928 C(i ,j+2UL) = sum( xmm3 ) * scalar;
7929 C(i ,j+3UL) = sum( xmm4 ) * scalar;
7930 C(i+1UL,j ) = sum( xmm5 ) * scalar;
7931 C(i+1UL,j+1UL) = sum( xmm6 ) * scalar;
7932 C(i+1UL,j+2UL) = sum( xmm7 ) * scalar;
7933 C(i+1UL,j+3UL) = sum( xmm8 ) * scalar;
7934
7935 for( ; remainder && k<kend; ++k ) {
7936 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
7937 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
7938 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
7939 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
7940 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
7941 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
7942 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
7943 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
7944 }
7945 }
7946 else if( k < kend )
7947 {
7948 ElementType value1( A(i ,k) * B(k,j ) );
7949 ElementType value2( A(i ,k) * B(k,j+1UL) );
7950 ElementType value3( A(i ,k) * B(k,j+2UL) );
7951 ElementType value4( A(i ,k) * B(k,j+3UL) );
7952 ElementType value5( A(i+1UL,k) * B(k,j ) );
7953 ElementType value6( A(i+1UL,k) * B(k,j+1UL) );
7954 ElementType value7( A(i+1UL,k) * B(k,j+2UL) );
7955 ElementType value8( A(i+1UL,k) * B(k,j+3UL) );
7956
7957 for( ++k; k<kend; ++k ) {
7958 value1 += A(i ,k) * B(k,j );
7959 value2 += A(i ,k) * B(k,j+1UL);
7960 value3 += A(i ,k) * B(k,j+2UL);
7961 value4 += A(i ,k) * B(k,j+3UL);
7962 value5 += A(i+1UL,k) * B(k,j );
7963 value6 += A(i+1UL,k) * B(k,j+1UL);
7964 value7 += A(i+1UL,k) * B(k,j+2UL);
7965 value8 += A(i+1UL,k) * B(k,j+3UL);
7966 }
7967
7968 C(i ,j ) = value1 * scalar;
7969 C(i ,j+1UL) = value2 * scalar;
7970 C(i ,j+2UL) = value3 * scalar;
7971 C(i ,j+3UL) = value4 * scalar;
7972 C(i+1UL,j ) = value5 * scalar;
7973 C(i+1UL,j+1UL) = value6 * scalar;
7974 C(i+1UL,j+2UL) = value7 * scalar;
7975 C(i+1UL,j+3UL) = value8 * scalar;
7976 }
7977 else
7978 {
7979 reset( C(i ,j ) );
7980 reset( C(i ,j+1UL) );
7981 reset( C(i ,j+2UL) );
7982 reset( C(i ,j+3UL) );
7983 reset( C(i+1UL,j ) );
7984 reset( C(i+1UL,j+1UL) );
7985 reset( C(i+1UL,j+2UL) );
7986 reset( C(i+1UL,j+3UL) );
7987 }
7988 }
7989
7990 for( ; (j+2UL) <= jend; j+=2UL )
7991 {
7992 const size_t kbegin( ( IsUpper_v<MT4> )
7993 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
7994 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
7995 const size_t kend( ( IsLower_v<MT4> )
7996 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
7997 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7998
7999 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8000 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8001
8002 size_t k( kbegin );
8003
8004 if( k < kpos )
8005 {
8006 SIMDType a1( A.load(i ,k) );
8007 SIMDType a2( A.load(i+1UL,k) );
8008 SIMDType b1( B.load(k,j ) );
8009 SIMDType b2( B.load(k,j+1UL) );
8010 SIMDType xmm1( a1 * b1 );
8011 SIMDType xmm2( a1 * b2 );
8012 SIMDType xmm3( a2 * b1 );
8013 SIMDType xmm4( a2 * b2 );
8014
8015 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8016 a1 = A.load(i ,k);
8017 a2 = A.load(i+1UL,k);
8018 b1 = B.load(k,j );
8019 b2 = B.load(k,j+1UL);
8020 xmm1 += a1 * b1;
8021 xmm2 += a1 * b2;
8022 xmm3 += a2 * b1;
8023 xmm4 += a2 * b2;
8024 }
8025
8026 C(i ,j ) = sum( xmm1 ) * scalar;
8027 C(i ,j+1UL) = sum( xmm2 ) * scalar;
8028 C(i+1UL,j ) = sum( xmm3 ) * scalar;
8029 C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
8030
8031 for( ; remainder && k<kend; ++k ) {
8032 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8033 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8034 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8035 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
8036 }
8037 }
8038 else if( k < kend )
8039 {
8040 ElementType value1( A(i ,k) * B(k,j ) );
8041 ElementType value2( A(i ,k) * B(k,j+1UL) );
8042 ElementType value3( A(i+1UL,k) * B(k,j ) );
8043 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
8044
8045 for( ++k; k<kend; ++k ) {
8046 value1 += A(i ,k) * B(k,j );
8047 value2 += A(i ,k) * B(k,j+1UL);
8048 value3 += A(i+1UL,k) * B(k,j );
8049 value4 += A(i+1UL,k) * B(k,j+1UL);
8050 }
8051
8052 C(i ,j ) = value1 * scalar;
8053 C(i ,j+1UL) = value2 * scalar;
8054 C(i+1UL,j ) = value3 * scalar;
8055 C(i+1UL,j+1UL) = value4 * scalar;
8056 }
8057 else
8058 {
8059 reset( C(i ,j ) );
8060 reset( C(i ,j+1UL) );
8061 reset( C(i+1UL,j ) );
8062 reset( C(i+1UL,j+1UL) );
8063 }
8064 }
8065
8066 if( j < jend )
8067 {
8068 const size_t kbegin( ( IsUpper_v<MT4> )
8069 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8070 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8071 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
8072
8073 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8074 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8075
8076 size_t k( kbegin );
8077
8078 if( k < kpos )
8079 {
8080 SIMDType b1( B.load(k,j) );
8081 SIMDType xmm1( A.load(i ,k) * b1 );
8082 SIMDType xmm2( A.load(i+1UL,k) * b1 );
8083
8084 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8085 b1 = B.load(k,j);
8086 xmm1 += A.load(i ,k) * b1;
8087 xmm2 += A.load(i+1UL,k) * b1;
8088 }
8089
8090 C(i ,j) = sum( xmm1 ) * scalar;
8091 C(i+1UL,j) = sum( xmm2 ) * scalar;
8092
8093 for( ; remainder && k<kend; ++k ) {
8094 C(i ,j) += A(i ,k) * B(k,j) * scalar;
8095 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
8096 }
8097 }
8098 else if( k < kend )
8099 {
8100 ElementType value1( A(i ,k) * B(k,j) );
8101 ElementType value2( A(i+1UL,k) * B(k,j) );
8102
8103 for( ++k; k<kend; ++k ) {
8104 value1 += A(i ,k) * B(k,j);
8105 value2 += A(i+1UL,k) * B(k,j);
8106 }
8107
8108 C(i ,j) = value1 * scalar;
8109 C(i+1UL,j) = value2 * scalar;
8110 }
8111 else
8112 {
8113 reset( C(i ,j) );
8114 reset( C(i+1UL,j) );
8115 }
8116
8117 if( LOW ) ++j;
8118 }
8119
8120 if( LOW ) {
8121 for( ; j<N; ++j ) {
8122 reset( C(i ,j) );
8123 reset( C(i+1UL,j) );
8124 }
8125 }
8126 }
8127
8128 for( ; i<M; ++i )
8129 {
8130 const size_t jend( LOW ? i+1UL : N );
8131 size_t j( 0UL );
8132
8133 if( SYM || HERM ) {
8134 for( ; j<i; ++j ) {
8135 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
8136 }
8137 }
8138 else if( UPP ) {
8139 for( ; j<i; ++j ) {
8140 reset( C(i,j) );
8141 }
8142 }
8143
8144 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
8145 {
8146 const size_t kbegin( ( IsUpper_v<MT4> )
8147 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8148 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8149 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
8150
8151 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8152 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8153
8154 size_t k( kbegin );
8155
8156 if( k < kpos )
8157 {
8158 SIMDType a1( A.load(i,k) );
8159 SIMDType xmm1( a1 * B.load(k,j ) );
8160 SIMDType xmm2( a1 * B.load(k,j+1UL) );
8161 SIMDType xmm3( a1 * B.load(k,j+2UL) );
8162 SIMDType xmm4( a1 * B.load(k,j+3UL) );
8163
8164 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8165 a1 = A.load(i,k);
8166 xmm1 += a1 * B.load(k,j );
8167 xmm2 += a1 * B.load(k,j+1UL);
8168 xmm3 += a1 * B.load(k,j+2UL);
8169 xmm4 += a1 * B.load(k,j+3UL);
8170 }
8171
8172 C(i,j ) = sum( xmm1 ) * scalar;
8173 C(i,j+1UL) = sum( xmm2 ) * scalar;
8174 C(i,j+2UL) = sum( xmm3 ) * scalar;
8175 C(i,j+3UL) = sum( xmm4 ) * scalar;
8176
8177 for( ; remainder && k<kend; ++k ) {
8178 C(i,j ) += A(i,k) * B(k,j ) * scalar;
8179 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
8180 C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
8181 C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
8182 }
8183 }
8184 else if( k < kend )
8185 {
8186 ElementType value1( A(i,k) * B(k,j ) );
8187 ElementType value2( A(i,k) * B(k,j+1UL) );
8188 ElementType value3( A(i,k) * B(k,j+2UL) );
8189 ElementType value4( A(i,k) * B(k,j+3UL) );
8190
8191 for( ++k; k<kend; ++k ) {
8192 value1 += A(i,k) * B(k,j );
8193 value2 += A(i,k) * B(k,j+1UL);
8194 value3 += A(i,k) * B(k,j+2UL);
8195 value4 += A(i,k) * B(k,j+3UL);
8196 }
8197
8198 C(i,j ) = value1 * scalar;
8199 C(i,j+1UL) = value2 * scalar;
8200 C(i,j+2UL) = value3 * scalar;
8201 C(i,j+3UL) = value4 * scalar;
8202 }
8203 else
8204 {
8205 reset( C(i,j ) );
8206 reset( C(i,j+1UL) );
8207 reset( C(i,j+2UL) );
8208 reset( C(i,j+3UL) );
8209 }
8210 }
8211
8212 for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
8213 {
8214 const size_t kbegin( ( IsUpper_v<MT4> )
8215 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8216 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8217 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
8218
8219 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8220 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8221
8222 size_t k( kbegin );
8223
8224 if( k < kpos )
8225 {
8226 SIMDType a1( A.load(i,k) );
8227 SIMDType xmm1( a1 * B.load(k,j ) );
8228 SIMDType xmm2( a1 * B.load(k,j+1UL) );
8229
8230 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8231 a1 = A.load(i,k);
8232 xmm1 += a1 * B.load(k,j );
8233 xmm2 += a1 * B.load(k,j+1UL);
8234 }
8235
8236 C(i,j ) = sum( xmm1 ) * scalar;
8237 C(i,j+1UL) = sum( xmm2 ) * scalar;
8238
8239 for( ; remainder && k<kend; ++k ) {
8240 C(i,j ) += A(i,k) * B(k,j ) * scalar;
8241 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
8242 }
8243 }
8244 else if( k < kend )
8245 {
8246 ElementType value1( A(i,k) * B(k,j ) );
8247 ElementType value2( A(i,k) * B(k,j+1UL) );
8248
8249 for( ++k; k<kend; ++k ) {
8250 value1 += A(i,k) * B(k,j );
8251 value2 += A(i,k) * B(k,j+1UL);
8252 }
8253
8254 C(i,j ) = value1 * scalar;
8255 C(i,j+1UL) = value2 * scalar;
8256 }
8257 else
8258 {
8259 reset( C(i,j ) );
8260 reset( C(i,j+1UL) );
8261 }
8262 }
8263
8264 if( j < jend )
8265 {
8266 const size_t kbegin( ( IsUpper_v<MT4> )
8267 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8268 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8269
8270 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
8271 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
8272
8273 size_t k( kbegin );
8274
8275 if( k < kpos )
8276 {
8277 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
8278
8279 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8280 xmm1 += A.load(i,k) * B.load(k,j);
8281 }
8282
8283 C(i,j) = sum( xmm1 ) * scalar;
8284
8285 for( ; remainder && k<K; ++k ) {
8286 C(i,j) += A(i,k) * B(k,j) * scalar;
8287 }
8288 }
8289 else if( k < K )
8290 {
8291 ElementType value( A(i,k) * B(k,j) );
8292
8293 for( ++k; k<K; ++k ) {
8294 value += A(i,k) * B(k,j);
8295 }
8296
8297 C(i,j) = value * scalar;
8298 }
8299 else
8300 {
8301 reset( C(i,j) );
8302 }
8303
8304 if( LOW ) ++j;
8305 }
8306
8307 if( LOW ) {
8308 for( ; j<N; ++j ) {
8309 reset( C(i,j) );
8310 }
8311 }
8312 }
8313 }
8314 //**********************************************************************************************
8315
8316 //**Vectorized default assignment to column-major dense matrices (small matrices)***************
8331 template< typename MT3 // Type of the left-hand side target matrix
8332 , typename MT4 // Type of the left-hand side matrix operand
8333 , typename MT5 // Type of the right-hand side matrix operand
8334 , typename ST2 > // Type of the scalar value
8335 static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8336 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8337 {
8338 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
8339
8340 const size_t M( A.rows() );
8341 const size_t N( B.columns() );
8342 const size_t K( A.columns() );
8343
8344 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8345
8346 size_t i( 0UL );
8347
8348 for( ; !( LOW && UPP ) && (i+4UL) <= M; i+=4UL )
8349 {
8350 const size_t jend( LOW ? i+4UL : N );
8351 size_t j( 0UL );
8352
8353 if( SYM || HERM ) {
8354 for( ; j<i; ++j ) {
8355 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
8356 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
8357 C(i+2UL,j) = HERM ? conj( C(j,i+2UL) ) : C(j,i+2UL);
8358 C(i+3UL,j) = HERM ? conj( C(j,i+3UL) ) : C(j,i+3UL);
8359 }
8360 }
8361 else if( UPP ) {
8362 for( ; j<i; ++j ) {
8363 reset( C(i ,j) );
8364 reset( C(i+1UL,j) );
8365 reset( C(i+2UL,j) );
8366 reset( C(i+3UL,j) );
8367 }
8368 }
8369
8370 for( ; (j+2UL) <= jend; j+=2UL )
8371 {
8372 const size_t kbegin( ( IsUpper_v<MT4> )
8373 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8374 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8375 const size_t kend( ( IsLower_v<MT4> )
8376 ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
8377 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
8378
8379 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8380 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8381
8382 size_t k( kbegin );
8383
8384 if( k < kpos )
8385 {
8386 SIMDType a1( A.load(i ,k) );
8387 SIMDType a2( A.load(i+1UL,k) );
8388 SIMDType a3( A.load(i+2UL,k) );
8389 SIMDType a4( A.load(i+3UL,k) );
8390 SIMDType b1( B.load(k,j ) );
8391 SIMDType b2( B.load(k,j+1UL) );
8392 SIMDType xmm1( a1 * b1 );
8393 SIMDType xmm2( a1 * b2 );
8394 SIMDType xmm3( a2 * b1 );
8395 SIMDType xmm4( a2 * b2 );
8396 SIMDType xmm5( a3 * b1 );
8397 SIMDType xmm6( a3 * b2 );
8398 SIMDType xmm7( a4 * b1 );
8399 SIMDType xmm8( a4 * b2 );
8400
8401 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8402 a1 = A.load(i ,k);
8403 a2 = A.load(i+1UL,k);
8404 a3 = A.load(i+2UL,k);
8405 a4 = A.load(i+3UL,k);
8406 b1 = B.load(k,j );
8407 b2 = B.load(k,j+1UL);
8408 xmm1 += a1 * b1;
8409 xmm2 += a1 * b2;
8410 xmm3 += a2 * b1;
8411 xmm4 += a2 * b2;
8412 xmm5 += a3 * b1;
8413 xmm6 += a3 * b2;
8414 xmm7 += a4 * b1;
8415 xmm8 += a4 * b2;
8416 }
8417
8418 C(i ,j ) = sum( xmm1 ) * scalar;
8419 C(i ,j+1UL) = sum( xmm2 ) * scalar;
8420 C(i+1UL,j ) = sum( xmm3 ) * scalar;
8421 C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
8422 C(i+2UL,j ) = sum( xmm5 ) * scalar;
8423 C(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
8424 C(i+3UL,j ) = sum( xmm7 ) * scalar;
8425 C(i+3UL,j+1UL) = sum( xmm8 ) * scalar;
8426
8427 for( ; remainder && k<kend; ++k ) {
8428 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8429 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8430 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8431 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
8432 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
8433 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
8434 C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
8435 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
8436 }
8437 }
8438 else if( k < kend )
8439 {
8440 ElementType value1( A(i ,k) * B(k,j ) );
8441 ElementType value2( A(i ,k) * B(k,j+1UL) );
8442 ElementType value3( A(i+1UL,k) * B(k,j ) );
8443 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
8444 ElementType value5( A(i+2UL,k) * B(k,j ) );
8445 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
8446 ElementType value7( A(i+3UL,k) * B(k,j ) );
8447 ElementType value8( A(i+3UL,k) * B(k,j+1UL) );
8448
8449 for( ++k; k<kend; ++k ) {
8450 value1 += A(i ,k) * B(k,j );
8451 value2 += A(i ,k) * B(k,j+1UL);
8452 value3 += A(i+1UL,k) * B(k,j );
8453 value4 += A(i+1UL,k) * B(k,j+1UL);
8454 value5 += A(i+2UL,k) * B(k,j );
8455 value6 += A(i+2UL,k) * B(k,j+1UL);
8456 value7 += A(i+3UL,k) * B(k,j );
8457 value8 += A(i+3UL,k) * B(k,j+1UL);
8458 }
8459
8460 C(i ,j ) = value1 * scalar;
8461 C(i ,j+1UL) = value2 * scalar;
8462 C(i+1UL,j ) = value3 * scalar;
8463 C(i+1UL,j+1UL) = value4 * scalar;
8464 C(i+2UL,j ) = value5 * scalar;
8465 C(i+2UL,j+1UL) = value6 * scalar;
8466 C(i+3UL,j ) = value7 * scalar;
8467 C(i+3UL,j+1UL) = value8 * scalar;
8468 }
8469 else
8470 {
8471 reset( C(i ,j ) );
8472 reset( C(i ,j+1UL) );
8473 reset( C(i+1UL,j ) );
8474 reset( C(i+1UL,j+1UL) );
8475 reset( C(i+2UL,j ) );
8476 reset( C(i+2UL,j+1UL) );
8477 reset( C(i+3UL,j ) );
8478 reset( C(i+3UL,j+1UL) );
8479 }
8480 }
8481
8482 if( j < jend )
8483 {
8484 const size_t kbegin( ( IsUpper_v<MT4> )
8485 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8486 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8487 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
8488
8489 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8490 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8491
8492 size_t k( kbegin );
8493
8494 if( k< kpos )
8495 {
8496 SIMDType b1( B.load(k,j) );
8497 SIMDType xmm1( A.load(i ,k) * b1 );
8498 SIMDType xmm2( A.load(i+1UL,k) * b1 );
8499 SIMDType xmm3( A.load(i+2UL,k) * b1 );
8500 SIMDType xmm4( A.load(i+3UL,k) * b1 );
8501
8502 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8503 b1 = B.load(k,j);
8504 xmm1 += A.load(i ,k) * b1;
8505 xmm2 += A.load(i+1UL,k) * b1;
8506 xmm3 += A.load(i+2UL,k) * b1;
8507 xmm4 += A.load(i+3UL,k) * b1;
8508 }
8509
8510 C(i ,j) = sum( xmm1 ) * scalar;
8511 C(i+1UL,j) = sum( xmm2 ) * scalar;
8512 C(i+2UL,j) = sum( xmm3 ) * scalar;
8513 C(i+3UL,j) = sum( xmm4 ) * scalar;
8514
8515 for( ; remainder && k<kend; ++k ) {
8516 C(i ,j) += A(i ,k) * B(k,j) * scalar;
8517 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
8518 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
8519 C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
8520 }
8521 }
8522 else if( k < kend )
8523 {
8524 ElementType value1( A(i ,k) * B(k,j) );
8525 ElementType value2( A(i+1UL,k) * B(k,j) );
8526 ElementType value3( A(i+2UL,k) * B(k,j) );
8527 ElementType value4( A(i+3UL,k) * B(k,j) );
8528
8529 for( ++k; k<kend; ++k ) {
8530 value1 += A(i ,k) * B(k,j);
8531 value2 += A(i+1UL,k) * B(k,j);
8532 value3 += A(i+2UL,k) * B(k,j);
8533 value4 += A(i+3UL,k) * B(k,j);
8534 }
8535
8536 C(i ,j) = value1 * scalar;
8537 C(i+1UL,j) = value2 * scalar;
8538 C(i+2UL,j) = value3 * scalar;
8539 C(i+3UL,j) = value4 * scalar;
8540 }
8541 else
8542 {
8543 reset( C(i ,j) );
8544 reset( C(i+1UL,j) );
8545 reset( C(i+2UL,j) );
8546 reset( C(i+3UL,j) );
8547 }
8548
8549 if( LOW ) ++j;
8550 }
8551
8552 if( LOW ) {
8553 for( ; j<N; ++j ) {
8554 reset( C(i ,j) );
8555 reset( C(i+1UL,j) );
8556 reset( C(i+2UL,j) );
8557 reset( C(i+3UL,j) );
8558 }
8559 }
8560 }
8561
8562 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
8563 {
8564 const size_t jend( LOW ? i+3UL : N );
8565 size_t j( 0UL );
8566
8567 if( SYM || HERM ) {
8568 for( ; j<i; ++j ) {
8569 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
8570 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
8571 C(i+2UL,j) = HERM ? conj( C(j,i+2UL) ) : C(j,i+2UL);
8572 }
8573 }
8574 else if( UPP ) {
8575 for( ; j<i; ++j ) {
8576 reset( C(i ,j) );
8577 reset( C(i+1UL,j) );
8578 reset( C(i+2UL,j) );
8579 }
8580 }
8581
8582 for( ; (j+3UL) <= jend; j+=3UL )
8583 {
8584 const size_t kbegin( ( IsUpper_v<MT4> )
8585 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8586 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8587 const size_t kend( ( IsLower_v<MT4> )
8588 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
8589 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
8590
8591 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8592 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8593
8594 size_t k( kbegin );
8595
8596 if( k < kpos )
8597 {
8598 SIMDType a1( A.load(i ,k) );
8599 SIMDType a2( A.load(i+1UL,k) );
8600 SIMDType a3( A.load(i+2UL,k) );
8601 SIMDType b1( B.load(k,j ) );
8602 SIMDType b2( B.load(k,j+1UL) );
8603 SIMDType b3( B.load(k,j+2UL) );
8604 SIMDType xmm1( a1 * b1 );
8605 SIMDType xmm2( a1 * b2 );
8606 SIMDType xmm3( a1 * b3 );
8607 SIMDType xmm4( a2 * b1 );
8608 SIMDType xmm5( a2 * b2 );
8609 SIMDType xmm6( a2 * b3 );
8610 SIMDType xmm7( a3 * b1 );
8611 SIMDType xmm8( a3 * b2 );
8612 SIMDType xmm9( a3 * b3 );
8613
8614 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8615 a1 = A.load(i ,k);
8616 a2 = A.load(i+1UL,k);
8617 a3 = A.load(i+2UL,k);
8618 b1 = B.load(k,j );
8619 b2 = B.load(k,j+1UL);
8620 b3 = B.load(k,j+2UL);
8621 xmm1 += a1 * b1;
8622 xmm2 += a1 * b2;
8623 xmm3 += a1 * b3;
8624 xmm4 += a2 * b1;
8625 xmm5 += a2 * b2;
8626 xmm6 += a2 * b3;
8627 xmm7 += a3 * b1;
8628 xmm8 += a3 * b2;
8629 xmm9 += a3 * b3;
8630 }
8631
8632 C(i ,j ) = sum( xmm1 ) * scalar;
8633 C(i ,j+1UL) = sum( xmm2 ) * scalar;
8634 C(i ,j+2UL) = sum( xmm3 ) * scalar;
8635 C(i+1UL,j ) = sum( xmm4 ) * scalar;
8636 C(i+1UL,j+1UL) = sum( xmm5 ) * scalar;
8637 C(i+1UL,j+2UL) = sum( xmm6 ) * scalar;
8638 C(i+2UL,j ) = sum( xmm7 ) * scalar;
8639 C(i+2UL,j+1UL) = sum( xmm8 ) * scalar;
8640 C(i+2UL,j+2UL) = sum( xmm9 ) * scalar;
8641
8642 for( ; remainder && k<kend; ++k ) {
8643 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8644 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8645 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
8646 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8647 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
8648 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
8649 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
8650 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
8651 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL) * scalar;
8652 }
8653 }
8654 else if( k < kend )
8655 {
8656 ElementType value1( A(i ,k) * B(k,j ) );
8657 ElementType value2( A(i ,k) * B(k,j+1UL) );
8658 ElementType value3( A(i ,k) * B(k,j+2UL) );
8659 ElementType value4( A(i+1UL,k) * B(k,j ) );
8660 ElementType value5( A(i+1UL,k) * B(k,j+1UL) );
8661 ElementType value6( A(i+1UL,k) * B(k,j+2UL) );
8662 ElementType value7( A(i+2UL,k) * B(k,j ) );
8663 ElementType value8( A(i+2UL,k) * B(k,j+1UL) );
8664 ElementType value9( A(i+2UL,k) * B(k,j+2UL) );
8665
8666 for( ++k; k<kend; ++k ) {
8667 value1 += A(i ,k) * B(k,j );
8668 value2 += A(i ,k) * B(k,j+1UL);
8669 value3 += A(i ,k) * B(k,j+2UL);
8670 value4 += A(i+1UL,k) * B(k,j );
8671 value5 += A(i+1UL,k) * B(k,j+1UL);
8672 value6 += A(i+1UL,k) * B(k,j+2UL);
8673 value7 += A(i+2UL,k) * B(k,j );
8674 value8 += A(i+2UL,k) * B(k,j+1UL);
8675 value9 += A(i+2UL,k) * B(k,j+2UL);
8676 }
8677
8678 C(i ,j ) = value1 * scalar;
8679 C(i ,j+1UL) = value2 * scalar;
8680 C(i ,j+2UL) = value3 * scalar;
8681 C(i+1UL,j ) = value4 * scalar;
8682 C(i+1UL,j+1UL) = value5 * scalar;
8683 C(i+1UL,j+2UL) = value6 * scalar;
8684 C(i+2UL,j ) = value7 * scalar;
8685 C(i+2UL,j+1UL) = value8 * scalar;
8686 C(i+2UL,j+2UL) = value9 * scalar;
8687 }
8688 else
8689 {
8690 reset( C(i ,j ) );
8691 reset( C(i ,j+1UL) );
8692 reset( C(i ,j+2UL) );
8693 reset( C(i+1UL,j ) );
8694 reset( C(i+1UL,j+1UL) );
8695 reset( C(i+1UL,j+2UL) );
8696 reset( C(i+2UL,j ) );
8697 reset( C(i+2UL,j+1UL) );
8698 reset( C(i+2UL,j+2UL) );
8699 }
8700 }
8701
8702 for( ; (j+2UL) <= jend; j+=2UL )
8703 {
8704 const size_t kbegin( ( IsUpper_v<MT4> )
8705 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8706 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8707 const size_t kend( ( IsLower_v<MT4> )
8708 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
8709 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
8710
8711 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8712 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8713
8714 size_t k( kbegin );
8715
8716 if( k < kpos )
8717 {
8718 SIMDType a1( A.load(i ,k) );
8719 SIMDType a2( A.load(i+1UL,k) );
8720 SIMDType a3( A.load(i+2UL,k) );
8721 SIMDType b1( B.load(k,j ) );
8722 SIMDType b2( B.load(k,j+1UL) );
8723 SIMDType xmm1( a1 * b1 );
8724 SIMDType xmm2( a1 * b2 );
8725 SIMDType xmm3( a2 * b1 );
8726 SIMDType xmm4( a2 * b2 );
8727 SIMDType xmm5( a3 * b1 );
8728 SIMDType xmm6( a3 * b2 );
8729
8730 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8731 a1 = A.load(i ,k);
8732 a2 = A.load(i+1UL,k);
8733 a3 = A.load(i+2UL,k);
8734 b1 = B.load(k,j );
8735 b2 = B.load(k,j+1UL);
8736 xmm1 += a1 * b1;
8737 xmm2 += a1 * b2;
8738 xmm3 += a2 * b1;
8739 xmm4 += a2 * b2;
8740 xmm5 += a3 * b1;
8741 xmm6 += a3 * b2;
8742 }
8743
8744 C(i ,j ) = sum( xmm1 ) * scalar;
8745 C(i ,j+1UL) = sum( xmm2 ) * scalar;
8746 C(i+1UL,j ) = sum( xmm3 ) * scalar;
8747 C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
8748 C(i+2UL,j ) = sum( xmm5 ) * scalar;
8749 C(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
8750
8751 for( ; remainder && k<kend; ++k ) {
8752 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8753 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8754 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8755 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
8756 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
8757 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
8758 }
8759 }
8760 else if( k < kend )
8761 {
8762 ElementType value1( A(i ,k) * B(k,j ) );
8763 ElementType value2( A(i ,k) * B(k,j+1UL) );
8764 ElementType value3( A(i+1UL,k) * B(k,j ) );
8765 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
8766 ElementType value5( A(i+2UL,k) * B(k,j ) );
8767 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
8768
8769 for( ++k; k<kend; ++k ) {
8770 value1 += A(i ,k) * B(k,j );
8771 value2 += A(i ,k) * B(k,j+1UL);
8772 value3 += A(i+1UL,k) * B(k,j );
8773 value4 += A(i+1UL,k) * B(k,j+1UL);
8774 value5 += A(i+2UL,k) * B(k,j );
8775 value6 += A(i+2UL,k) * B(k,j+1UL);
8776 }
8777
8778 C(i ,j ) = value1 * scalar;
8779 C(i ,j+1UL) = value2 * scalar;
8780 C(i+1UL,j ) = value3 * scalar;
8781 C(i+1UL,j+1UL) = value4 * scalar;
8782 C(i+2UL,j ) = value5 * scalar;
8783 C(i+2UL,j+1UL) = value6 * scalar;
8784 }
8785 else
8786 {
8787 reset( C(i ,j ) );
8788 reset( C(i ,j+1UL) );
8789 reset( C(i+1UL,j ) );
8790 reset( C(i+1UL,j+1UL) );
8791 reset( C(i+2UL,j ) );
8792 reset( C(i+2UL,j+1UL) );
8793 }
8794 }
8795
8796 if( j < jend )
8797 {
8798 const size_t kbegin( ( IsUpper_v<MT4> )
8799 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8800 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8801 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
8802
8803 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8804 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8805
8806 size_t k( kbegin );
8807
8808 if( k< kpos )
8809 {
8810 SIMDType b1( B.load(k,j) );
8811 SIMDType xmm1( A.load(i ,k) * b1 );
8812 SIMDType xmm2( A.load(i+1UL,k) * b1 );
8813 SIMDType xmm3( A.load(i+2UL,k) * b1 );
8814
8815 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8816 b1 = B.load(k,j);
8817 xmm1 += A.load(i ,k) * b1;
8818 xmm2 += A.load(i+1UL,k) * b1;
8819 xmm3 += A.load(i+2UL,k) * b1;
8820 }
8821
8822 C(i ,j) = sum( xmm1 ) * scalar;
8823 C(i+1UL,j) = sum( xmm2 ) * scalar;
8824 C(i+2UL,j) = sum( xmm3 ) * scalar;
8825
8826 for( ; remainder && k<kend; ++k ) {
8827 C(i ,j) += A(i ,k) * B(k,j) * scalar;
8828 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
8829 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
8830 }
8831 }
8832 else if( k < kend )
8833 {
8834 ElementType value1( A(i ,k) * B(k,j) );
8835 ElementType value2( A(i+1UL,k) * B(k,j) );
8836 ElementType value3( A(i+2UL,k) * B(k,j) );
8837
8838 for( ++k; k<kend; ++k ) {
8839 value1 += A(i ,k) * B(k,j);
8840 value2 += A(i+1UL,k) * B(k,j);
8841 value3 += A(i+2UL,k) * B(k,j);
8842 }
8843
8844 C(i ,j) = value1 * scalar;
8845 C(i+1UL,j) = value2 * scalar;
8846 C(i+2UL,j) = value3 * scalar;
8847 }
8848 else
8849 {
8850 reset( C(i ,j) );
8851 reset( C(i+1UL,j) );
8852 reset( C(i+2UL,j) );
8853 }
8854
8855 if( LOW ) ++j;
8856 }
8857
8858 if( LOW ) {
8859 for( ; j<N; ++j ) {
8860 reset( C(i ,j) );
8861 reset( C(i+1UL,j) );
8862 reset( C(i+2UL,j) );
8863 }
8864 }
8865 }
8866
8867 for( ; (i+2UL) <= M; i+=2UL )
8868 {
8869 const size_t jend( LOW ? i+2UL : N );
8870 size_t j( 0UL );
8871
8872 if( SYM || HERM ) {
8873 for( ; j<i; ++j ) {
8874 C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
8875 C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
8876 }
8877 }
8878 else if( UPP ) {
8879 for( ; j<i; ++j ) {
8880 reset( C(i ,j) );
8881 reset( C(i+1UL,j) );
8882 }
8883 }
8884
8885 for( ; (j+2UL) <= jend; j+=2UL )
8886 {
8887 const size_t kbegin( ( IsUpper_v<MT4> )
8888 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8889 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8890 const size_t kend( ( IsLower_v<MT4> )
8891 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
8892 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
8893
8894 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8895 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8896
8897 size_t k( kbegin );
8898
8899 if( k < kpos )
8900 {
8901 SIMDType a1( A.load(i ,k) );
8902 SIMDType a2( A.load(i+1UL,k) );
8903 SIMDType b1( B.load(k,j ) );
8904 SIMDType b2( B.load(k,j+1UL) );
8905 SIMDType xmm1( a1 * b1 );
8906 SIMDType xmm2( a1 * b2 );
8907 SIMDType xmm3( a2 * b1 );
8908 SIMDType xmm4( a2 * b2 );
8909
8910 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8911 a1 = A.load(i ,k);
8912 a2 = A.load(i+1UL,k);
8913 b1 = B.load(k,j );
8914 b2 = B.load(k,j+1UL);
8915 xmm1 += a1 * b1;
8916 xmm2 += a1 * b2;
8917 xmm3 += a2 * b1;
8918 xmm4 += a2 * b2;
8919 }
8920
8921 C(i ,j ) = sum( xmm1 ) * scalar;
8922 C(i ,j+1UL) = sum( xmm2 ) * scalar;
8923 C(i+1UL,j ) = sum( xmm3 ) * scalar;
8924 C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
8925
8926 for( ; remainder && k<kend; ++k ) {
8927 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8928 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8929 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8930 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
8931 }
8932 }
8933 else if( k < kend )
8934 {
8935 ElementType value1( A(i ,k) * B(k,j ) );
8936 ElementType value2( A(i ,k) * B(k,j+1UL) );
8937 ElementType value3( A(i+1UL,k) * B(k,j ) );
8938 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
8939
8940 for( ++k; k<kend; ++k ) {
8941 value1 += A(i ,k) * B(k,j );
8942 value2 += A(i ,k) * B(k,j+1UL);
8943 value3 += A(i+1UL,k) * B(k,j );
8944 value4 += A(i+1UL,k) * B(k,j+1UL);
8945 }
8946
8947 C(i ,j ) = value1 * scalar;
8948 C(i ,j+1UL) = value2 * scalar;
8949 C(i+1UL,j ) = value3 * scalar;
8950 C(i+1UL,j+1UL) = value4 * scalar;
8951 }
8952 else
8953 {
8954 reset( C(i ,j ) );
8955 reset( C(i ,j+1UL) );
8956 reset( C(i+1UL,j ) );
8957 reset( C(i+1UL,j+1UL) );
8958 }
8959 }
8960
8961 if( j < jend )
8962 {
8963 const size_t kbegin( ( IsUpper_v<MT4> )
8964 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
8965 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
8966 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
8967
8968 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
8969 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
8970
8971 size_t k( kbegin );
8972
8973 if( k < kpos )
8974 {
8975 SIMDType b1( B.load(k,j) );
8976 SIMDType xmm1( A.load(i ,k) * b1 );
8977 SIMDType xmm2( A.load(i+1UL,k) * b1 );
8978
8979 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
8980 b1 = B.load(k,j);
8981 xmm1 += A.load(i ,k) * b1;
8982 xmm2 += A.load(i+1UL,k) * b1;
8983 }
8984
8985 C(i ,j) = sum( xmm1 ) * scalar;
8986 C(i+1UL,j) = sum( xmm2 ) * scalar;
8987
8988 for( ; remainder && k<kend; ++k ) {
8989 C(i ,j) += A(i ,k) * B(k,j) * scalar;
8990 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
8991 }
8992 }
8993 else if( k < kend )
8994 {
8995 ElementType value1( A(i ,k) * B(k,j) );
8996 ElementType value2( A(i+1UL,k) * B(k,j) );
8997
8998 for( ++k; k<kend; ++k ) {
8999 value1 += A(i ,k) * B(k,j);
9000 value2 += A(i+1UL,k) * B(k,j);
9001 }
9002
9003 C(i ,j) = value1 * scalar;
9004 C(i+1UL,j) = value2 * scalar;
9005 }
9006 else
9007 {
9008 reset( C(i ,j) );
9009 reset( C(i+1UL,j) );
9010 }
9011
9012 if( LOW ) ++j;
9013 }
9014
9015 if( LOW ) {
9016 for( ; j<N; ++j ) {
9017 reset( C(i ,j) );
9018 reset( C(i+1UL,j) );
9019 }
9020 }
9021 }
9022
9023 for( ; i<M; ++i )
9024 {
9025 const size_t jend( LOW ? i+1UL : N );
9026 size_t j( 0UL );
9027
9028 if( SYM || HERM ) {
9029 for( ; j<i; ++j ) {
9030 C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
9031 }
9032 }
9033 else if( UPP ) {
9034 for( ; j<i; ++j ) {
9035 reset( C(i,j) );
9036 }
9037 }
9038
9039 for( ; (j+2UL) <= jend; j+=2UL )
9040 {
9041 const size_t kbegin( ( IsUpper_v<MT4> )
9042 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
9043 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
9044 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
9045
9046 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
9047 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
9048
9049 size_t k( kbegin );
9050
9051 if( k < kpos )
9052 {
9053 SIMDType a1( A.load(i,k) );
9054 SIMDType xmm1( a1 * B.load(k,j ) );
9055 SIMDType xmm2( a1 * B.load(k,j+1UL) );
9056
9057 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
9058 a1 = A.load(i,k);
9059 xmm1 += a1 * B.load(k,j );
9060 xmm2 += a1 * B.load(k,j+1UL);
9061 }
9062
9063 C(i,j ) = sum( xmm1 ) * scalar;
9064 C(i,j+1UL) = sum( xmm2 ) * scalar;
9065
9066 for( ; remainder && k<kend; ++k ) {
9067 C(i,j ) += A(i,k) * B(k,j ) * scalar;
9068 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
9069 }
9070 }
9071 else if( k < kend )
9072 {
9073 ElementType value1( A(i,k) * B(k,j ) );
9074 ElementType value2( A(i,k) * B(k,j+1UL) );
9075
9076 for( ++k; k<kend; ++k ) {
9077 value1 += A(i,k) * B(k,j );
9078 value2 += A(i,k) * B(k,j+1UL);
9079 }
9080
9081 C(i,j ) = value1 * scalar;
9082 C(i,j+1UL) = value2 * scalar;
9083 }
9084 else
9085 {
9086 reset( C(i,j ) );
9087 reset( C(i,j+1UL) );
9088 }
9089 }
9090
9091 if( j < jend )
9092 {
9093 const size_t kbegin( ( IsUpper_v<MT4> )
9094 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
9095 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
9096
9097 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
9098 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
9099
9100 size_t k( kbegin );
9101
9102 if( k < kpos )
9103 {
9104 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
9105
9106 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
9107 xmm1 += A.load(i,k) * B.load(k,j);
9108 }
9109
9110 C(i,j) = sum( xmm1 ) * scalar;
9111
9112 for( ; remainder && k<K; ++k ) {
9113 C(i,j) += A(i,k) * B(k,j) * scalar;
9114 }
9115 }
9116 else if( k < K )
9117 {
9118 ElementType value( A(i,k) * B(k,j) );
9119
9120 for( ++k; k<K; ++k ) {
9121 value += A(i,k) * B(k,j);
9122 }
9123
9124 C(i,j) = value * scalar;
9125 }
9126 else
9127 {
9128 reset( C(i,j) );
9129 }
9130
9131 if( LOW ) ++j;
9132 }
9133
9134 if( LOW ) {
9135 for( ; j<N; ++j ) {
9136 reset( C(i,j) );
9137 }
9138 }
9139 }
9140 }
9141 //**********************************************************************************************
9142
9143 //**Default assignment to dense matrices (large matrices)***************************************
9157 template< typename MT3 // Type of the left-hand side target matrix
9158 , typename MT4 // Type of the left-hand side matrix operand
9159 , typename MT5 // Type of the right-hand side matrix operand
9160 , typename ST2 > // Type of the scalar value
9161 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9162 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9163 {
9164 selectDefaultAssignKernel( C, A, B, scalar );
9165 }
9166 //**********************************************************************************************
9167
9168 //**Vectorized default assignment to dense matrices (large matrices)****************************
9183 template< typename MT3 // Type of the left-hand side target matrix
9184 , typename MT4 // Type of the left-hand side matrix operand
9185 , typename MT5 // Type of the right-hand side matrix operand
9186 , typename ST2 > // Type of the scalar value
9187 static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9188 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9189 {
9190 if( SYM )
9191 smmm( C, A, B, scalar );
9192 else if( HERM )
9193 hmmm( C, A, B, scalar );
9194 else if( LOW )
9195 lmmm( C, A, B, scalar, ST2(0) );
9196 else if( UPP )
9197 ummm( C, A, B, scalar, ST2(0) );
9198 else
9199 mmm( C, A, B, scalar, ST2(0) );
9200 }
9201 //**********************************************************************************************
9202
9203 //**BLAS-based assignment to dense matrices (default)*******************************************
9217 template< typename MT3 // Type of the left-hand side target matrix
9218 , typename MT4 // Type of the left-hand side matrix operand
9219 , typename MT5 // Type of the right-hand side matrix operand
9220 , typename ST2 > // Type of the scalar value
9221 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9222 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9223 {
9224 selectLargeAssignKernel( C, A, B, scalar );
9225 }
9226 //**********************************************************************************************
9227
9228 //**BLAS-based assignment to dense matrices*****************************************************
9229#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9243 template< typename MT3 // Type of the left-hand side target matrix
9244 , typename MT4 // Type of the left-hand side matrix operand
9245 , typename MT5 // Type of the right-hand side matrix operand
9246 , typename ST2 > // Type of the scalar value
9247 static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9248 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9249 {
9250 using ET = ElementType_t<MT3>;
9251
9252 if( IsTriangular_v<MT4> ) {
9253 assign( C, B );
9254 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9255 }
9256 else if( IsTriangular_v<MT5> ) {
9257 assign( C, A );
9258 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9259 }
9260 else {
9261 gemm( C, A, B, ET(scalar), ET(0) );
9262 }
9263 }
9264#endif
9265 //**********************************************************************************************
9266
9267 //**Assignment to sparse matrices***************************************************************
9279 template< typename MT // Type of the target sparse matrix
9280 , bool SO > // Storage order of the target sparse matrix
9281 friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9282 {
9284
9285 using TmpType = If_t< SO, OppositeType, ResultType >;
9286
9293
9294 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9295 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9296
9297 const ForwardFunctor fwd;
9298
9299 const TmpType tmp( serial( rhs ) );
9300 assign( *lhs, fwd( tmp ) );
9301 }
9302 //**********************************************************************************************
9303
9304 //**Addition assignment to dense matrices*******************************************************
9316 template< typename MT // Type of the target dense matrix
9317 , bool SO > // Storage order of the target dense matrix
9318 friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9319 {
9321
9322 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
9323 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
9324
9325 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9326 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9327
9328 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9329 return;
9330 }
9331
9332 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
9333 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
9334
9335 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9336 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9337 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9338 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9339 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
9340 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
9341
9342 DMatScalarMultExpr::selectAddAssignKernel( *lhs, A, B, rhs.scalar_ );
9343 }
9344 //**********************************************************************************************
9345
9346 //**Addition assignment to dense matrices (kernel selection)************************************
9357 template< typename MT3 // Type of the left-hand side target matrix
9358 , typename MT4 // Type of the left-hand side matrix operand
9359 , typename MT5 // Type of the right-hand side matrix operand
9360 , typename ST2 > // Type of the scalar value
9361 static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9362 {
9363 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
9364 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
9365 selectSmallAddAssignKernel( C, A, B, scalar );
9366 else
9367 selectBlasAddAssignKernel( C, A, B, scalar );
9368 }
9369 //**********************************************************************************************
9370
9371 //**Default addition assignment to dense matrices (general/general)*****************************
9385 template< typename MT3 // Type of the left-hand side target matrix
9386 , typename MT4 // Type of the left-hand side matrix operand
9387 , typename MT5 // Type of the right-hand side matrix operand
9388 , typename ST2 > // Type of the scalar value
9389 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9390 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9391 {
9392 const ResultType tmp( serial( A * B * scalar ) );
9393 addAssign( C, tmp );
9394 }
9395 //**********************************************************************************************
9396
9397 //**Default addition assignment to row-major dense matrices (general/diagonal)******************
9411 template< typename MT3 // Type of the left-hand side target matrix
9412 , typename MT4 // Type of the left-hand side matrix operand
9413 , typename MT5 // Type of the right-hand side matrix operand
9414 , typename ST2 > // Type of the scalar value
9415 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9416 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9417 {
9418 const size_t M( A.rows() );
9419 const size_t N( B.columns() );
9420
9421 for( size_t i=0UL; i<M; ++i )
9422 {
9423 const size_t jbegin( ( IsUpper_v<MT4> )
9424 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
9425 :( 0UL ) );
9426 const size_t jend( ( IsLower_v<MT4> )
9427 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
9428 :( N ) );
9429 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
9430
9431 const size_t jnum( jend - jbegin );
9432 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
9433 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
9434
9435 for( size_t j=jbegin; j<jpos; j+=2UL ) {
9436 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
9437 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
9438 }
9439 if( jpos < jend ) {
9440 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
9441 }
9442 }
9443 }
9444 //**********************************************************************************************
9445
9446 //**Default addition assignment to column-major dense matrices (general/diagonal)***************
9460 template< typename MT3 // Type of the left-hand side target matrix
9461 , typename MT4 // Type of the left-hand side matrix operand
9462 , typename MT5 // Type of the right-hand side matrix operand
9463 , typename ST2 > // Type of the scalar value
9464 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9465 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9466 {
9467 constexpr size_t block( BLOCK_SIZE );
9468
9469 const size_t M( A.rows() );
9470 const size_t N( B.columns() );
9471
9472 for( size_t jj=0UL; jj<N; jj+=block ) {
9473 const size_t jend( min( N, jj+block ) );
9474 for( size_t ii=0UL; ii<M; ii+=block ) {
9475 const size_t iend( min( M, ii+block ) );
9476 for( size_t j=jj; j<jend; ++j )
9477 {
9478 const size_t ibegin( ( IsLower_v<MT4> )
9479 ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
9480 :( ii ) );
9481 const size_t ipos( ( IsUpper_v<MT4> )
9482 ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
9483 :( iend ) );
9484
9485 for( size_t i=ibegin; i<ipos; ++i ) {
9486 C(i,j) += A(i,j) * B(j,j) * scalar;
9487 }
9488 }
9489 }
9490 }
9491 }
9492 //**********************************************************************************************
9493
9494 //**Default addition assignment to row-major dense matrices (diagonal/general)******************
9508 template< typename MT3 // Type of the left-hand side target matrix
9509 , typename MT4 // Type of the left-hand side matrix operand
9510 , typename MT5 // Type of the right-hand side matrix operand
9511 , typename ST2 > // Type of the scalar value
9512 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9513 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9514 {
9515 constexpr size_t block( BLOCK_SIZE );
9516
9517 const size_t M( A.rows() );
9518 const size_t N( B.columns() );
9519
9520 for( size_t ii=0UL; ii<M; ii+=block ) {
9521 const size_t iend( min( M, ii+block ) );
9522 for( size_t jj=0UL; jj<N; jj+=block ) {
9523 const size_t jend( min( N, jj+block ) );
9524 for( size_t i=ii; i<iend; ++i )
9525 {
9526 const size_t jbegin( ( IsUpper_v<MT5> )
9527 ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
9528 :( jj ) );
9529 const size_t jpos( ( IsLower_v<MT5> )
9530 ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
9531 :( jend ) );
9532
9533 for( size_t j=jbegin; j<jpos; ++j ) {
9534 C(i,j) += A(i,i) * B(i,j) * scalar;
9535 }
9536 }
9537 }
9538 }
9539 }
9540 //**********************************************************************************************
9541
9542 //**Default addition assignment to column-major dense matrices (diagonal/general)***************
9556 template< typename MT3 // Type of the left-hand side target matrix
9557 , typename MT4 // Type of the left-hand side matrix operand
9558 , typename MT5 // Type of the right-hand side matrix operand
9559 , typename ST2 > // Type of the scalar value
9560 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9561 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9562 {
9563 const size_t M( A.rows() );
9564 const size_t N( B.columns() );
9565
9566 for( size_t j=0UL; j<N; ++j )
9567 {
9568 const size_t ibegin( ( IsLower_v<MT5> )
9569 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
9570 :( 0UL ) );
9571 const size_t iend( ( IsUpper_v<MT5> )
9572 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
9573 :( M ) );
9574 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
9575
9576 const size_t inum( iend - ibegin );
9577 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
9578 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
9579
9580 for( size_t i=ibegin; i<ipos; i+=2UL ) {
9581 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
9582 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
9583 }
9584 if( ipos < iend ) {
9585 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
9586 }
9587 }
9588 }
9589 //**********************************************************************************************
9590
9591 //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
9605 template< typename MT3 // Type of the left-hand side target matrix
9606 , typename MT4 // Type of the left-hand side matrix operand
9607 , typename MT5 // Type of the right-hand side matrix operand
9608 , typename ST2 > // Type of the scalar value
9609 static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9610 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9611 {
9612 for( size_t i=0UL; i<A.rows(); ++i ) {
9613 C(i,i) += A(i,i) * B(i,i) * scalar;
9614 }
9615 }
9616 //**********************************************************************************************
9617
9618 //**Default addition assignment to dense matrices (small matrices)******************************
9632 template< typename MT3 // Type of the left-hand side target matrix
9633 , typename MT4 // Type of the left-hand side matrix operand
9634 , typename MT5 // Type of the right-hand side matrix operand
9635 , typename ST2 > // Type of the scalar value
9636 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9637 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9638 {
9639 selectDefaultAddAssignKernel( C, A, B, scalar );
9640 }
9641 //**********************************************************************************************
9642
9643 //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
9658 template< typename MT3 // Type of the left-hand side target matrix
9659 , typename MT4 // Type of the left-hand side matrix operand
9660 , typename MT5 // Type of the right-hand side matrix operand
9661 , typename ST2 > // Type of the scalar value
9662 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9663 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9664 {
9665 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
9666
9667 const size_t M( A.rows() );
9668 const size_t N( B.columns() );
9669 const size_t K( A.columns() );
9670
9671 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
9672
9673 size_t i( 0UL );
9674
9675 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
9676 {
9677 const size_t jend( LOW ? i+3UL : N );
9678 size_t j( UPP ? i : 0UL );
9679
9680 for( ; (j+3UL) <= jend; j+=3UL )
9681 {
9682 const size_t kbegin( ( IsUpper_v<MT4> )
9683 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
9684 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
9685 const size_t kend( ( IsLower_v<MT4> )
9686 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
9687 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
9688
9689 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
9690 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
9691
9692 size_t k( kbegin );
9693
9694 if( k < kpos )
9695 {
9696 SIMDType a1( A.load(i ,k) );
9697 SIMDType a2( A.load(i+1UL,k) );
9698 SIMDType a3( A.load(i+2UL,k) );
9699 SIMDType b1( B.load(k,j ) );
9700 SIMDType b2( B.load(k,j+1UL) );
9701 SIMDType b3( B.load(k,j+2UL) );
9702 SIMDType xmm1( a1 * b1 );
9703 SIMDType xmm2( a1 * b2 );
9704 SIMDType xmm3( a1 * b3 );
9705 SIMDType xmm4( a2 * b1 );
9706 SIMDType xmm5( a2 * b2 );
9707 SIMDType xmm6( a2 * b3 );
9708 SIMDType xmm7( a3 * b1 );
9709 SIMDType xmm8( a3 * b2 );
9710 SIMDType xmm9( a3 * b3 );
9711
9712 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
9713 a1 = A.load(i ,k);
9714 a2 = A.load(i+1UL,k);
9715 a3 = A.load(i+2UL,k);
9716 b1 = B.load(k,j );
9717 b2 = B.load(k,j+1UL);
9718 b3 = B.load(k,j+2UL);
9719 xmm1 += a1 * b1;
9720 xmm2 += a1 * b2;
9721 xmm3 += a1 * b3;
9722 xmm4 += a2 * b1;
9723 xmm5 += a2 * b2;
9724 xmm6 += a2 * b3;
9725 xmm7 += a3 * b1;
9726 xmm8 += a3 * b2;
9727 xmm9 += a3 * b3;
9728 }
9729
9730 C(i ,j ) += sum( xmm1 ) * scalar;
9731 C(i ,j+1UL) += sum( xmm2 ) * scalar;
9732 C(i ,j+2UL) += sum( xmm3 ) * scalar;
9733 C(i+1UL,j ) += sum( xmm4 ) * scalar;
9734 C(i+1UL,j+1UL) += sum( xmm5 ) * scalar;
9735 C(i+1UL,j+2UL) += sum( xmm6 ) * scalar;
9736 C(i+2UL,j ) += sum( xmm7 ) * scalar;
9737 C(i+2UL,j+1UL) += sum( xmm8 ) * scalar;
9738 C(i+2UL,j+2UL) += sum( xmm9 ) * scalar;
9739
9740 for( ; remainder && k<kend; ++k ) {
9741 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
9742 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
9743 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
9744 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
9745 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
9746 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
9747 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
9748 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
9749 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL) * scalar;
9750 }
9751 }
9752 else if( k < kend )
9753 {
9754 ElementType value1( A(i ,k) * B(k,j ) );
9755 ElementType value2( A(i ,k) * B(k,j+1UL) );
9756 ElementType value3( A(i ,k) * B(k,j+2UL) );
9757 ElementType value4( A(i+1UL,k) * B(k,j ) );
9758 ElementType value5( A(i+1UL,k) * B(k,j+1UL) );
9759 ElementType value6( A(i+1UL,k) * B(k,j+2UL) );
9760 ElementType value7( A(i+2UL,k) * B(k,j ) );
9761 ElementType value8( A(i+2UL,k) * B(k,j+1UL) );
9762 ElementType value9( A(i+2UL,k) * B(k,j+2UL) );
9763
9764 for( ++k; k<kend; ++k ) {
9765 value1 += A(i ,k) * B(k,j );
9766 value2 += A(i ,k) * B(k,j+1UL);
9767 value3 += A(i ,k) * B(k,j+2UL);
9768 value4 += A(i+1UL,k) * B(k,j );
9769 value5 += A(i+1UL,k) * B(k,j+1UL);
9770 value6 += A(i+1UL,k) * B(k,j+2UL);
9771 value7 += A(i+2UL,k) * B(k,j );
9772 value8 += A(i+2UL,k) * B(k,j+1UL);
9773 value9 += A(i+2UL,k) * B(k,j+2UL);
9774 }
9775
9776 C(i ,j ) += value1 * scalar;
9777 C(i ,j+1UL) += value2 * scalar;
9778 C(i ,j+2UL) += value3 * scalar;
9779 C(i+1UL,j ) += value4 * scalar;
9780 C(i+1UL,j+1UL) += value5 * scalar;
9781 C(i+1UL,j+2UL) += value6 * scalar;
9782 C(i+2UL,j ) += value7 * scalar;
9783 C(i+2UL,j+1UL) += value8 * scalar;
9784 C(i+2UL,j+2UL) += value9 * scalar;
9785 }
9786 }
9787
9788 for( ; (j+2UL) <= jend; j+=2UL )
9789 {
9790 const size_t kbegin( ( IsUpper_v<MT4> )
9791 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
9792 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
9793 const size_t kend( ( IsLower_v<MT4> )
9794 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
9795 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
9796
9797 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
9798 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
9799
9800 size_t k( kbegin );
9801
9802 if( k < kpos )
9803 {
9804 SIMDType a1( A.load(i ,k) );
9805 SIMDType a2( A.load(i+1UL,k) );
9806 SIMDType a3( A.load(i+2UL,k) );
9807 SIMDType b1( B.load(k,j ) );
9808 SIMDType b2( B.load(k,j+1UL) );
9809 SIMDType xmm1( a1 * b1 );
9810 SIMDType xmm2( a1 * b2 );
9811 SIMDType xmm3( a2 * b1 );
9812 SIMDType xmm4( a2 * b2 );
9813 SIMDType xmm5( a3 * b1 );
9814 SIMDType xmm6( a3 * b2 );
9815
9816 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
9817 a1 = A.load(i ,k);
9818 a2 = A.load(i+1UL,k);
9819 a3 = A.load(i+2UL,k);
9820 b1 = B.load(k,j );
9821 b2 = B.load(k,j+1UL);
9822 xmm1 += a1 * b1;
9823 xmm2 += a1 * b2;
9824 xmm3 += a2 * b1;
9825 xmm4 += a2 * b2;
9826 xmm5 += a3 * b1;
9827 xmm6 += a3 * b2;
9828 }
9829
9830 C(i ,j ) += sum( xmm1 ) * scalar;
9831 C(i ,j+1UL) += sum( xmm2 ) * scalar;
9832 C(i+1UL,j ) += sum( xmm3 ) * scalar;
9833 C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
9834 C(i+2UL,j ) += sum( xmm5 ) * scalar;
9835 C(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
9836
9837 for( ; remainder && k<kend; ++k ) {
9838 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
9839 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
9840 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
9841 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
9842 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
9843 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
9844 }
9845 }
9846 else if( k < kend )
9847 {
9848 ElementType value1( A(i ,k) * B(k,j ) );
9849 ElementType value2( A(i ,k) * B(k,j+1UL) );
9850 ElementType value3( A(i+1UL,k) * B(k,j ) );
9851 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
9852 ElementType value5( A(i+2UL,k) * B(k,j ) );
9853 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
9854
9855 for( ++k; k<kend; ++k ) {
9856 value1 += A(i ,k) * B(k,j );
9857 value2 += A(i ,k) * B(k,j+1UL);
9858 value3 += A(i+1UL,k) * B(k,j );
9859 value4 += A(i+1UL,k) * B(k,j+1UL);
9860 value5 += A(i+2UL,k) * B(k,j );
9861 value6 += A(i+2UL,k) * B(k,j+1UL);
9862 }
9863
9864 C(i ,j ) += value1 * scalar;
9865 C(i ,j+1UL) += value2 * scalar;
9866 C(i+1UL,j ) += value3 * scalar;
9867 C(i+1UL,j+1UL) += value4 * scalar;
9868 C(i+2UL,j ) += value5 * scalar;
9869 C(i+2UL,j+1UL) += value6 * scalar;
9870 }
9871 }
9872
9873 if( j < jend )
9874 {
9875 const size_t kbegin( ( IsUpper_v<MT4> )
9876 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
9877 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
9878 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
9879
9880 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
9881 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
9882
9883 size_t k( kbegin );
9884
9885 if( k < kpos )
9886 {
9887 SIMDType b1( B.load(k,j) );
9888 SIMDType xmm1( A.load(i ,k) * b1 );
9889 SIMDType xmm2( A.load(i+1UL,k) * b1 );
9890 SIMDType xmm3( A.load(i+2UL,k) * b1 );
9891
9892 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
9893 b1 = B.load(k,j);
9894 xmm1 += A.load(i ,k) * b1;
9895 xmm2 += A.load(i+1UL,k) * b1;
9896 xmm3 += A.load(i+2UL,k) * b1;
9897 }
9898
9899 C(i ,j) += sum( xmm1 ) * scalar;
9900 C(i+1UL,j) += sum( xmm2 ) * scalar;
9901 C(i+2UL,j) += sum( xmm3 ) * scalar;
9902
9903 for( ; remainder && k<kend; ++k ) {
9904 C(i ,j) += A(i ,k) * B(k,j) * scalar;
9905 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
9906 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
9907 }
9908 }
9909 else if( k < kend )
9910 {
9911 ElementType value1( A(i ,k) * B(k,j) );
9912 ElementType value2( A(i+1UL,k) * B(k,j) );
9913 ElementType value3( A(i+2UL,k) * B(k,j) );
9914
9915 for( ++k; k<kend; ++k ) {
9916 value1 += A(i ,k) * B(k,j);
9917 value2 += A(i+1UL,k) * B(k,j);
9918 value3 += A(i+2UL,k) * B(k,j);
9919 }
9920
9921 C(i ,j) += value1 * scalar;
9922 C(i+1UL,j) += value2 * scalar;
9923 C(i+2UL,j) += value3 * scalar;
9924 }
9925 }
9926 }
9927
9928 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
9929 {
9930 const size_t jend( LOW ? i+2UL : N );
9931 size_t j( UPP ? i : 0UL );
9932
9933 for( ; (j+4UL) <= jend; j+=4UL )
9934 {
9935 const size_t kbegin( ( IsUpper_v<MT4> )
9936 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
9937 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
9938 const size_t kend( ( IsLower_v<MT4> )
9939 ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
9940 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
9941
9942 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
9943 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
9944
9945 size_t k( kbegin );
9946
9947 if( k < kpos )
9948 {
9949 SIMDType a1( A.load(i ,k) );
9950 SIMDType a2( A.load(i+1UL,k) );
9951 SIMDType b1( B.load(k,j ) );
9952 SIMDType b2( B.load(k,j+1UL) );
9953 SIMDType b3( B.load(k,j+2UL) );
9954 SIMDType b4( B.load(k,j+3UL) );
9955 SIMDType xmm1( a1 * b1 );
9956 SIMDType xmm2( a1 * b2 );
9957 SIMDType xmm3( a1 * b3 );
9958 SIMDType xmm4( a1 * b4 );
9959 SIMDType xmm5( a2 * b1 );
9960 SIMDType xmm6( a2 * b2 );
9961 SIMDType xmm7( a2 * b3 );
9962 SIMDType xmm8( a2 * b4 );
9963
9964 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
9965 a1 = A.load(i ,k);
9966 a2 = A.load(i+1UL,k);
9967 b1 = B.load(k,j );
9968 b2 = B.load(k,j+1UL);
9969 b3 = B.load(k,j+2UL);
9970 b4 = B.load(k,j+3UL);
9971 xmm1 += a1 * b1;
9972 xmm2 += a1 * b2;
9973 xmm3 += a1 * b3;
9974 xmm4 += a1 * b4;
9975 xmm5 += a2 * b1;
9976 xmm6 += a2 * b2;
9977 xmm7 += a2 * b3;
9978 xmm8 += a2 * b4;
9979 }
9980
9981 C(i ,j ) += sum( xmm1 ) * scalar;
9982 C(i ,j+1UL) += sum( xmm2 ) * scalar;
9983 C(i ,j+2UL) += sum( xmm3 ) * scalar;
9984 C(i ,j+3UL) += sum( xmm4 ) * scalar;
9985 C(i+1UL,j ) += sum( xmm5 ) * scalar;
9986 C(i+1UL,j+1UL) += sum( xmm6 ) * scalar;
9987 C(i+1UL,j+2UL) += sum( xmm7 ) * scalar;
9988 C(i+1UL,j+3UL) += sum( xmm8 ) * scalar;
9989
9990 for( ; remainder && k<kend; ++k ) {
9991 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
9992 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
9993 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
9994 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
9995 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
9996 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
9997 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
9998 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
9999 }
10000 }
10001 else if( k < kend )
10002 {
10003 ElementType value1( A(i ,k) * B(k,j ) );
10004 ElementType value2( A(i ,k) * B(k,j+1UL) );
10005 ElementType value3( A(i ,k) * B(k,j+2UL) );
10006 ElementType value4( A(i ,k) * B(k,j+3UL) );
10007 ElementType value5( A(i+1UL,k) * B(k,j ) );
10008 ElementType value6( A(i+1UL,k) * B(k,j+1UL) );
10009 ElementType value7( A(i+1UL,k) * B(k,j+2UL) );
10010 ElementType value8( A(i+1UL,k) * B(k,j+3UL) );
10011
10012 for( ++k; k<kend; ++k ) {
10013 value1 += A(i ,k) * B(k,j );
10014 value2 += A(i ,k) * B(k,j+1UL);
10015 value3 += A(i ,k) * B(k,j+2UL);
10016 value4 += A(i ,k) * B(k,j+3UL);
10017 value5 += A(i+1UL,k) * B(k,j );
10018 value6 += A(i+1UL,k) * B(k,j+1UL);
10019 value7 += A(i+1UL,k) * B(k,j+2UL);
10020 value8 += A(i+1UL,k) * B(k,j+3UL);
10021 }
10022
10023 C(i ,j ) += value1 * scalar;
10024 C(i ,j+1UL) += value2 * scalar;
10025 C(i ,j+2UL) += value3 * scalar;
10026 C(i ,j+3UL) += value4 * scalar;
10027 C(i+1UL,j ) += value5 * scalar;
10028 C(i+1UL,j+1UL) += value6 * scalar;
10029 C(i+1UL,j+2UL) += value7 * scalar;
10030 C(i+1UL,j+3UL) += value8 * scalar;
10031 }
10032 }
10033
10034 for( ; (j+2UL) <= jend; j+=2UL )
10035 {
10036 const size_t kbegin( ( IsUpper_v<MT4> )
10037 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10038 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10039 const size_t kend( ( IsLower_v<MT4> )
10040 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
10041 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
10042
10043 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10044 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10045
10046 size_t k( kbegin );
10047
10048 if( k < kpos )
10049 {
10050 SIMDType a1( A.load(i ,k) );
10051 SIMDType a2( A.load(i+1UL,k) );
10052 SIMDType b1( B.load(k,j ) );
10053 SIMDType b2( B.load(k,j+1UL) );
10054 SIMDType xmm1( a1 * b1 );
10055 SIMDType xmm2( a1 * b2 );
10056 SIMDType xmm3( a2 * b1 );
10057 SIMDType xmm4( a2 * b2 );
10058
10059 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10060 a1 = A.load(i ,k);
10061 a2 = A.load(i+1UL,k);
10062 b1 = B.load(k,j );
10063 b2 = B.load(k,j+1UL);
10064 xmm1 += a1 * b1;
10065 xmm2 += a1 * b2;
10066 xmm3 += a2 * b1;
10067 xmm4 += a2 * b2;
10068 }
10069
10070 C(i ,j ) += sum( xmm1 ) * scalar;
10071 C(i ,j+1UL) += sum( xmm2 ) * scalar;
10072 C(i+1UL,j ) += sum( xmm3 ) * scalar;
10073 C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
10074
10075 for( ; remainder && k<kend; ++k ) {
10076 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10077 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10078 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10079 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
10080 }
10081 }
10082 else if( k < kend )
10083 {
10084 ElementType value1( A(i ,k) * B(k,j ) );
10085 ElementType value2( A(i ,k) * B(k,j+1UL) );
10086 ElementType value3( A(i+1UL,k) * B(k,j ) );
10087 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
10088
10089 for( ++k; k<kend; ++k ) {
10090 value1 += A(i ,k) * B(k,j );
10091 value2 += A(i ,k) * B(k,j+1UL);
10092 value3 += A(i+1UL,k) * B(k,j );
10093 value4 += A(i+1UL,k) * B(k,j+1UL);
10094 }
10095
10096 C(i ,j ) += value1 * scalar;
10097 C(i ,j+1UL) += value2 * scalar;
10098 C(i+1UL,j ) += value3 * scalar;
10099 C(i+1UL,j+1UL) += value4 * scalar;
10100 }
10101 }
10102
10103 if( j < jend )
10104 {
10105 const size_t kbegin( ( IsUpper_v<MT4> )
10106 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10107 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10108 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
10109
10110 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10111 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10112
10113 size_t k( kbegin );
10114
10115 if( k < kpos )
10116 {
10117 SIMDType b1( B.load(k,j) );
10118 SIMDType xmm1( A.load(i ,k) * b1 );
10119 SIMDType xmm2( A.load(i+1UL,k) * b1 );
10120
10121 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10122 b1 = B.load(k,j);
10123 xmm1 += A.load(i ,k) * b1;
10124 xmm2 += A.load(i+1UL,k) * b1;
10125 }
10126
10127 C(i ,j) += sum( xmm1 ) * scalar;
10128 C(i+1UL,j) += sum( xmm2 ) * scalar;
10129
10130 for( ; remainder && k<kend; ++k ) {
10131 C(i ,j) += A(i ,k) * B(k,j) * scalar;
10132 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
10133 }
10134 }
10135 else if( k < kend )
10136 {
10137 ElementType value1( A(i ,k) * B(k,j) );
10138 ElementType value2( A(i+1UL,k) * B(k,j) );
10139
10140 for( ++k; k<kend; ++k ) {
10141 value1 += A(i ,k) * B(k,j);
10142 value2 += A(i+1UL,k) * B(k,j);
10143 }
10144
10145 C(i ,j) += value1 * scalar;
10146 C(i+1UL,j) += value2 * scalar;
10147 }
10148 }
10149 }
10150
10151 for( ; i<M; ++i )
10152 {
10153 const size_t jend( LOW ? i+1UL : N );
10154 size_t j( UPP ? i : 0UL );
10155
10156 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
10157 {
10158 const size_t kbegin( ( IsUpper_v<MT4> )
10159 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10160 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10161 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
10162
10163 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10164 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10165
10166 size_t k( kbegin );
10167
10168 if( k < kpos )
10169 {
10170 SIMDType a1( A.load(i,k) );
10171 SIMDType xmm1( a1 * B.load(k,j ) );
10172 SIMDType xmm2( a1 * B.load(k,j+1UL) );
10173 SIMDType xmm3( a1 * B.load(k,j+2UL) );
10174 SIMDType xmm4( a1 * B.load(k,j+3UL) );
10175
10176 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10177 a1 = A.load(i,k);
10178 xmm1 += a1 * B.load(k,j );
10179 xmm2 += a1 * B.load(k,j+1UL);
10180 xmm3 += a1 * B.load(k,j+2UL);
10181 xmm4 += a1 * B.load(k,j+3UL);
10182 }
10183
10184 C(i,j ) += sum( xmm1 ) * scalar;
10185 C(i,j+1UL) += sum( xmm2 ) * scalar;
10186 C(i,j+2UL) += sum( xmm3 ) * scalar;
10187 C(i,j+3UL) += sum( xmm4 ) * scalar;
10188
10189 for( ; remainder && k<kend; ++k ) {
10190 C(i,j ) += A(i,k) * B(k,j ) * scalar;
10191 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
10192 C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
10193 C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
10194 }
10195 }
10196 else if( k < kend )
10197 {
10198 ElementType value1( A(i,k) * B(k,j ) );
10199 ElementType value2( A(i,k) * B(k,j+1UL) );
10200 ElementType value3( A(i,k) * B(k,j+2UL) );
10201 ElementType value4( A(i,k) * B(k,j+3UL) );
10202
10203 for( ++k; k<kend; ++k ) {
10204 value1 += A(i,k) * B(k,j );
10205 value2 += A(i,k) * B(k,j+1UL);
10206 value3 += A(i,k) * B(k,j+2UL);
10207 value4 += A(i,k) * B(k,j+3UL);
10208 }
10209
10210 C(i,j ) += value1 * scalar;
10211 C(i,j+1UL) += value2 * scalar;
10212 C(i,j+2UL) += value3 * scalar;
10213 C(i,j+3UL) += value4 * scalar;
10214 }
10215 }
10216
10217 for( ; (j+2UL) <= jend; j+=2UL )
10218 {
10219 const size_t kbegin( ( IsUpper_v<MT4> )
10220 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10221 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10222 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
10223
10224 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10225 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10226
10227 size_t k( kbegin );
10228
10229 if( k < kpos )
10230 {
10231 SIMDType a1( A.load(i,k) );
10232 SIMDType xmm1( a1 * B.load(k,j ) );
10233 SIMDType xmm2( a1 * B.load(k,j+1UL) );
10234
10235 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10236 a1 = A.load(i,k);
10237 xmm1 += a1 * B.load(k,j );
10238 xmm2 += a1 * B.load(k,j+1UL);
10239 }
10240
10241 C(i,j ) += sum( xmm1 ) * scalar;
10242 C(i,j+1UL) += sum( xmm2 ) * scalar;
10243
10244 for( ; remainder && k<kend; ++k ) {
10245 C(i,j ) += A(i,k) * B(k,j ) * scalar;
10246 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
10247 }
10248 }
10249 else if( k < kend )
10250 {
10251 ElementType value1( A(i,k) * B(k,j ) );
10252 ElementType value2( A(i,k) * B(k,j+1UL) );
10253
10254 for( ++k; k<kend; ++k ) {
10255 value1 += A(i,k) * B(k,j );
10256 value2 += A(i,k) * B(k,j+1UL);
10257 }
10258
10259 C(i,j ) += value1 * scalar;
10260 C(i,j+1UL) += value2 * scalar;
10261 }
10262 }
10263
10264 if( j < jend )
10265 {
10266 const size_t kbegin( ( IsUpper_v<MT4> )
10267 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10268 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10269
10270 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
10271 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
10272
10273 size_t k( kbegin );
10274
10275 if( k < kpos )
10276 {
10277 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
10278
10279 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10280 xmm1 += A.load(i,k) * B.load(k,j);
10281 }
10282
10283 C(i,j) += sum( xmm1 ) * scalar;
10284
10285 for( ; remainder && k<K; ++k ) {
10286 C(i,j) += A(i,k) * B(k,j) * scalar;
10287 }
10288 }
10289 else if( k < K )
10290 {
10291 ElementType value( A(i,k) * B(k,j) );
10292
10293 for( ++k; k<K; ++k ) {
10294 value += A(i,k) * B(k,j);
10295 }
10296
10297 C(i,j) += value * scalar;
10298 }
10299 }
10300 }
10301 }
10302 //**********************************************************************************************
10303
10304 //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
/*!\brief Vectorized default addition assignment of a small scaled dense matrix product
//        to a column-major target (C += ( A * B ) * scalar).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to every accumulated dot product.
// \return void
//
// This overload takes part in overload resolution only if MT3 is a column-major matrix and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true (see the trailing return type).
//
// The result matrix is processed in register tiles: every element C(i,j) of a tile owns one
// accumulator that collects A(i,k)*B(k,j) over the inner index k. The tile shapes used are,
// in order, 4x2 / 4x1, 3x3 / 3x2 / 3x1, 2x2 / 2x1 and finally 1x2 / 1x1 for the last row.
// The four-row and three-row passes run only if neither LOW nor UPP is set; the two-row and
// one-row passes clip the column range to [UPP ? i : 0, LOW ? i+rows : N).
//
// For every tile the inner range [kbegin,kend) is narrowed according to the
// IsUpper_v / IsLower_v traits of the two operands, then split at kpos:
//   - [kbegin,kpos) is processed with SIMD loads in steps of SIMDSIZE,
//   - [kpos,kend) is processed element-wise (only needed if \a remainder is true),
//   - if no SIMD step fits at all, the whole range is processed element-wise.
*/
10319 template< typename MT3 // Type of the left-hand side target matrix
10320 , typename MT4 // Type of the left-hand side matrix operand
10321 , typename MT5 // Type of the right-hand side matrix operand
10322 , typename ST2 > // Type of the scalar value
10323 static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10324 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10325 {
      // A scalar tail loop is required as soon as one of the operands is not padded.
      // NOTE(review): with both operands padded the SIMD loop runs up to kend itself and
      // presumably relies on the padding elements being zero - confirm.
10326 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
10327
10328 const size_t M( A.rows() );
10329 const size_t N( B.columns() );
10330 const size_t K( A.columns() );
10331
      // A restricted (lower/upper) result is only meaningful for a square target.
10332 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
10333
10334 size_t i( 0UL );
10335
      // Pass 1: four rows at a time (skipped entirely if LOW or UPP is set).
10336 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
10337 {
10338 size_t j( 0UL );
10339
      // 4x2 tiles.
10340 for( ; (j+2UL) <= N; j+=2UL )
10341 {
      // Narrow the inner range using the triangular-structure traits of A (MT4) and B (MT5).
      // kbegin is passed through prevMultiple() so that the SIMD loop starts on a SIMDSIZE
      // boundary (prevMultiple presumably rounds down to a multiple of SIMDSIZE).
10342 const size_t kbegin( ( IsUpper_v<MT4> )
10343 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10344 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10345 const size_t kend( ( IsLower_v<MT4> )
10346 ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
10347 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
10348
      // kpos marks the end of the part that is handled by the SIMD loop.
10349 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10350 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10351
10352 size_t k( kbegin );
10353
      // SIMD path: the first step initializes the eight accumulators, the loop adds to them.
10354 if( k < kpos )
10355 {
10356 SIMDType a1( A.load(i ,k) );
10357 SIMDType a2( A.load(i+1UL,k) );
10358 SIMDType a3( A.load(i+2UL,k) );
10359 SIMDType a4( A.load(i+3UL,k) );
10360 SIMDType b1( B.load(k,j ) );
10361 SIMDType b2( B.load(k,j+1UL) );
10362 SIMDType xmm1( a1 * b1 );
10363 SIMDType xmm2( a1 * b2 );
10364 SIMDType xmm3( a2 * b1 );
10365 SIMDType xmm4( a2 * b2 );
10366 SIMDType xmm5( a3 * b1 );
10367 SIMDType xmm6( a3 * b2 );
10368 SIMDType xmm7( a4 * b1 );
10369 SIMDType xmm8( a4 * b2 );
10370
10371 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10372 a1 = A.load(i ,k);
10373 a2 = A.load(i+1UL,k);
10374 a3 = A.load(i+2UL,k);
10375 a4 = A.load(i+3UL,k);
10376 b1 = B.load(k,j );
10377 b2 = B.load(k,j+1UL);
10378 xmm1 += a1 * b1;
10379 xmm2 += a1 * b2;
10380 xmm3 += a2 * b1;
10381 xmm4 += a2 * b2;
10382 xmm5 += a3 * b1;
10383 xmm6 += a3 * b2;
10384 xmm7 += a4 * b1;
10385 xmm8 += a4 * b2;
10386 }
10387
      // Reduce each accumulator with sum(), scale it and add it to the target element.
10388 C(i ,j ) += sum( xmm1 ) * scalar;
10389 C(i ,j+1UL) += sum( xmm2 ) * scalar;
10390 C(i+1UL,j ) += sum( xmm3 ) * scalar;
10391 C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
10392 C(i+2UL,j ) += sum( xmm5 ) * scalar;
10393 C(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
10394 C(i+3UL,j ) += sum( xmm7 ) * scalar;
10395 C(i+3UL,j+1UL) += sum( xmm8 ) * scalar;
10396
      // Scalar tail for the elements in [kpos,kend).
10397 for( ; remainder && k<kend; ++k ) {
10398 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10399 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10400 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10401 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
10402 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
10403 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
10404 C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
10405 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
10406 }
10407 }
      // Scalar path: no SIMD step fits, but the inner range is not empty.
10408 else if( k < kend )
10409 {
10410 ElementType value1( A(i ,k) * B(k,j ) );
10411 ElementType value2( A(i ,k) * B(k,j+1UL) );
10412 ElementType value3( A(i+1UL,k) * B(k,j ) );
10413 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
10414 ElementType value5( A(i+2UL,k) * B(k,j ) );
10415 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
10416 ElementType value7( A(i+3UL,k) * B(k,j ) );
10417 ElementType value8( A(i+3UL,k) * B(k,j+1UL) );
10418
10419 for( ++k; k<kend; ++k ) {
10420 value1 += A(i ,k) * B(k,j );
10421 value2 += A(i ,k) * B(k,j+1UL);
10422 value3 += A(i+1UL,k) * B(k,j );
10423 value4 += A(i+1UL,k) * B(k,j+1UL);
10424 value5 += A(i+2UL,k) * B(k,j );
10425 value6 += A(i+2UL,k) * B(k,j+1UL);
10426 value7 += A(i+3UL,k) * B(k,j );
10427 value8 += A(i+3UL,k) * B(k,j+1UL);
10428 }
10429
10430 C(i ,j ) += value1 * scalar;
10431 C(i ,j+1UL) += value2 * scalar;
10432 C(i+1UL,j ) += value3 * scalar;
10433 C(i+1UL,j+1UL) += value4 * scalar;
10434 C(i+2UL,j ) += value5 * scalar;
10435 C(i+2UL,j+1UL) += value6 * scalar;
10436 C(i+3UL,j ) += value7 * scalar;
10437 C(i+3UL,j+1UL) += value8 * scalar;
10438 }
10439 }
10440
      // 4x1 tile for the single remaining column (N odd).
10441 if( j < N )
10442 {
10443 const size_t kbegin( ( IsUpper_v<MT4> )
10444 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10445 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10446 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
10447
10448 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10449 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10450
10451 size_t k( kbegin );
10452
10453 if( k < kpos )
10454 {
10455 SIMDType b1( B.load(k,j) );
10456 SIMDType xmm1( A.load(i ,k) * b1 );
10457 SIMDType xmm2( A.load(i+1UL,k) * b1 );
10458 SIMDType xmm3( A.load(i+2UL,k) * b1 );
10459 SIMDType xmm4( A.load(i+3UL,k) * b1 );
10460
10461 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10462 b1 = B.load(k,j);
10463 xmm1 += A.load(i ,k) * b1;
10464 xmm2 += A.load(i+1UL,k) * b1;
10465 xmm3 += A.load(i+2UL,k) * b1;
10466 xmm4 += A.load(i+3UL,k) * b1;
10467 }
10468
10469 C(i ,j) += sum( xmm1 ) * scalar;
10470 C(i+1UL,j) += sum( xmm2 ) * scalar;
10471 C(i+2UL,j) += sum( xmm3 ) * scalar;
10472 C(i+3UL,j) += sum( xmm4 ) * scalar;
10473
10474 for( ; remainder && k<kend; ++k ) {
10475 C(i ,j) += A(i ,k) * B(k,j) * scalar;
10476 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
10477 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
10478 C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
10479 }
10480 }
10481 else if( k < kend )
10482 {
10483 ElementType value1( A(i ,k) * B(k,j) );
10484 ElementType value2( A(i+1UL,k) * B(k,j) );
10485 ElementType value3( A(i+2UL,k) * B(k,j) );
10486 ElementType value4( A(i+3UL,k) * B(k,j) );
10487
10488 for( ++k; k<kend; ++k ) {
10489 value1 += A(i ,k) * B(k,j);
10490 value2 += A(i+1UL,k) * B(k,j);
10491 value3 += A(i+2UL,k) * B(k,j);
10492 value4 += A(i+3UL,k) * B(k,j);
10493 }
10494
10495 C(i ,j) += value1 * scalar;
10496 C(i+1UL,j) += value2 * scalar;
10497 C(i+2UL,j) += value3 * scalar;
10498 C(i+3UL,j) += value4 * scalar;
10499 }
10500 }
10501 }
10502
      // Pass 2: three rows at a time for what pass 1 left over (again only if !LOW && !UPP).
10503 for( ; !LOW && !UPP && (i+3UL) <= M; i+=3UL )
10504 {
10505 size_t j( 0UL );
10506
      // 3x3 tiles.
10507 for( ; (j+3UL) <= N; j+=3UL )
10508 {
10509 const size_t kbegin( ( IsUpper_v<MT4> )
10510 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10511 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10512 const size_t kend( ( IsLower_v<MT4> )
10513 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
10514 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
10515
10516 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10517 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10518
10519 size_t k( kbegin );
10520
10521 if( k < kpos )
10522 {
10523 SIMDType a1( A.load(i ,k) );
10524 SIMDType a2( A.load(i+1UL,k) );
10525 SIMDType a3( A.load(i+2UL,k) );
10526 SIMDType b1( B.load(k,j ) );
10527 SIMDType b2( B.load(k,j+1UL) );
10528 SIMDType b3( B.load(k,j+2UL) );
10529 SIMDType xmm1( a1 * b1 );
10530 SIMDType xmm2( a1 * b2 );
10531 SIMDType xmm3( a1 * b3 );
10532 SIMDType xmm4( a2 * b1 );
10533 SIMDType xmm5( a2 * b2 );
10534 SIMDType xmm6( a2 * b3 );
10535 SIMDType xmm7( a3 * b1 );
10536 SIMDType xmm8( a3 * b2 );
10537 SIMDType xmm9( a3 * b3 );
10538
10539 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10540 a1 = A.load(i ,k);
10541 a2 = A.load(i+1UL,k);
10542 a3 = A.load(i+2UL,k);
10543 b1 = B.load(k,j );
10544 b2 = B.load(k,j+1UL);
10545 b3 = B.load(k,j+2UL);
10546 xmm1 += a1 * b1;
10547 xmm2 += a1 * b2;
10548 xmm3 += a1 * b3;
10549 xmm4 += a2 * b1;
10550 xmm5 += a2 * b2;
10551 xmm6 += a2 * b3;
10552 xmm7 += a3 * b1;
10553 xmm8 += a3 * b2;
10554 xmm9 += a3 * b3;
10555 }
10556
10557 C(i ,j ) += sum( xmm1 ) * scalar;
10558 C(i ,j+1UL) += sum( xmm2 ) * scalar;
10559 C(i ,j+2UL) += sum( xmm3 ) * scalar;
10560 C(i+1UL,j ) += sum( xmm4 ) * scalar;
10561 C(i+1UL,j+1UL) += sum( xmm5 ) * scalar;
10562 C(i+1UL,j+2UL) += sum( xmm6 ) * scalar;
10563 C(i+2UL,j ) += sum( xmm7 ) * scalar;
10564 C(i+2UL,j+1UL) += sum( xmm8 ) * scalar;
10565 C(i+2UL,j+2UL) += sum( xmm9 ) * scalar;
10566
10567 for( ; remainder && k<kend; ++k ) {
10568 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10569 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10570 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
10571 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10572 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
10573 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
10574 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
10575 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
10576 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL) * scalar;
10577 }
10578 }
10579 else if( k < kend )
10580 {
10581 ElementType value1( A(i ,k) * B(k,j ) );
10582 ElementType value2( A(i ,k) * B(k,j+1UL) );
10583 ElementType value3( A(i ,k) * B(k,j+2UL) );
10584 ElementType value4( A(i+1UL,k) * B(k,j ) );
10585 ElementType value5( A(i+1UL,k) * B(k,j+1UL) );
10586 ElementType value6( A(i+1UL,k) * B(k,j+2UL) );
10587 ElementType value7( A(i+2UL,k) * B(k,j ) );
10588 ElementType value8( A(i+2UL,k) * B(k,j+1UL) );
10589 ElementType value9( A(i+2UL,k) * B(k,j+2UL) );
10590
10591 for( ++k; k<kend; ++k ) {
10592 value1 += A(i ,k) * B(k,j );
10593 value2 += A(i ,k) * B(k,j+1UL);
10594 value3 += A(i ,k) * B(k,j+2UL);
10595 value4 += A(i+1UL,k) * B(k,j );
10596 value5 += A(i+1UL,k) * B(k,j+1UL);
10597 value6 += A(i+1UL,k) * B(k,j+2UL);
10598 value7 += A(i+2UL,k) * B(k,j );
10599 value8 += A(i+2UL,k) * B(k,j+1UL);
10600 value9 += A(i+2UL,k) * B(k,j+2UL);
10601 }
10602
10603 C(i ,j ) += value1 * scalar;
10604 C(i ,j+1UL) += value2 * scalar;
10605 C(i ,j+2UL) += value3 * scalar;
10606 C(i+1UL,j ) += value4 * scalar;
10607 C(i+1UL,j+1UL) += value5 * scalar;
10608 C(i+1UL,j+2UL) += value6 * scalar;
10609 C(i+2UL,j ) += value7 * scalar;
10610 C(i+2UL,j+1UL) += value8 * scalar;
10611 C(i+2UL,j+2UL) += value9 * scalar;
10612 }
10613 }
10614
      // 3x2 tile if two columns are left.
10615 for( ; (j+2UL) <= N; j+=2UL )
10616 {
10617 const size_t kbegin( ( IsUpper_v<MT4> )
10618 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10619 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10620 const size_t kend( ( IsLower_v<MT4> )
10621 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
10622 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
10623
10624 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10625 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10626
10627 size_t k( kbegin );
10628
10629 if( k < kpos )
10630 {
10631 SIMDType a1( A.load(i ,k) );
10632 SIMDType a2( A.load(i+1UL,k) );
10633 SIMDType a3( A.load(i+2UL,k) );
10634 SIMDType b1( B.load(k,j ) );
10635 SIMDType b2( B.load(k,j+1UL) );
10636 SIMDType xmm1( a1 * b1 );
10637 SIMDType xmm2( a1 * b2 );
10638 SIMDType xmm3( a2 * b1 );
10639 SIMDType xmm4( a2 * b2 );
10640 SIMDType xmm5( a3 * b1 );
10641 SIMDType xmm6( a3 * b2 );
10642
10643 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10644 a1 = A.load(i ,k);
10645 a2 = A.load(i+1UL,k);
10646 a3 = A.load(i+2UL,k);
10647 b1 = B.load(k,j );
10648 b2 = B.load(k,j+1UL);
10649 xmm1 += a1 * b1;
10650 xmm2 += a1 * b2;
10651 xmm3 += a2 * b1;
10652 xmm4 += a2 * b2;
10653 xmm5 += a3 * b1;
10654 xmm6 += a3 * b2;
10655 }
10656
10657 C(i ,j ) += sum( xmm1 ) * scalar;
10658 C(i ,j+1UL) += sum( xmm2 ) * scalar;
10659 C(i+1UL,j ) += sum( xmm3 ) * scalar;
10660 C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
10661 C(i+2UL,j ) += sum( xmm5 ) * scalar;
10662 C(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
10663
10664 for( ; remainder && k<kend; ++k ) {
10665 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10666 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10667 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10668 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
10669 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
10670 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
10671 }
10672 }
10673 else if( k < kend )
10674 {
10675 ElementType value1( A(i ,k) * B(k,j ) );
10676 ElementType value2( A(i ,k) * B(k,j+1UL) );
10677 ElementType value3( A(i+1UL,k) * B(k,j ) );
10678 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
10679 ElementType value5( A(i+2UL,k) * B(k,j ) );
10680 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
10681
10682 for( ++k; k<kend; ++k ) {
10683 value1 += A(i ,k) * B(k,j );
10684 value2 += A(i ,k) * B(k,j+1UL);
10685 value3 += A(i+1UL,k) * B(k,j );
10686 value4 += A(i+1UL,k) * B(k,j+1UL);
10687 value5 += A(i+2UL,k) * B(k,j );
10688 value6 += A(i+2UL,k) * B(k,j+1UL);
10689 }
10690
10691 C(i ,j ) += value1 * scalar;
10692 C(i ,j+1UL) += value2 * scalar;
10693 C(i+1UL,j ) += value3 * scalar;
10694 C(i+1UL,j+1UL) += value4 * scalar;
10695 C(i+2UL,j ) += value5 * scalar;
10696 C(i+2UL,j+1UL) += value6 * scalar;
10697 }
10698 }
10699
      // 3x1 tile for the last remaining column.
10700 if( j < N )
10701 {
10702 const size_t kbegin( ( IsUpper_v<MT4> )
10703 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10704 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10705 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
10706
10707 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10708 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10709
10710 size_t k( kbegin );
10711
10712 if( k < kpos )
10713 {
10714 SIMDType b1( B.load(k,j) );
10715 SIMDType xmm1( A.load(i ,k) * b1 );
10716 SIMDType xmm2( A.load(i+1UL,k) * b1 );
10717 SIMDType xmm3( A.load(i+2UL,k) * b1 );
10718
10719 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10720 b1 = B.load(k,j);
10721 xmm1 += A.load(i ,k) * b1;
10722 xmm2 += A.load(i+1UL,k) * b1;
10723 xmm3 += A.load(i+2UL,k) * b1;
10724 }
10725
10726 C(i ,j) += sum( xmm1 ) * scalar;
10727 C(i+1UL,j) += sum( xmm2 ) * scalar;
10728 C(i+2UL,j) += sum( xmm3 ) * scalar;
10729
10730 for( ; remainder && k<kend; ++k ) {
10731 C(i ,j) += A(i ,k) * B(k,j) * scalar;
10732 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
10733 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
10734 }
10735 }
10736 else if( k < kend )
10737 {
10738 ElementType value1( A(i ,k) * B(k,j) );
10739 ElementType value2( A(i+1UL,k) * B(k,j) );
10740 ElementType value3( A(i+2UL,k) * B(k,j) );
10741
10742 for( ++k; k<kend; ++k ) {
10743 value1 += A(i ,k) * B(k,j);
10744 value2 += A(i+1UL,k) * B(k,j);
10745 value3 += A(i+2UL,k) * B(k,j);
10746 }
10747
10748 C(i ,j) += value1 * scalar;
10749 C(i+1UL,j) += value2 * scalar;
10750 C(i+2UL,j) += value3 * scalar;
10751 }
10752 }
10753 }
10754
      // Pass 3: two rows at a time. This pass also serves the LOW/UPP cases, for which it is
      // the first one to run; the column range is clipped accordingly.
10755 for( ; (i+2UL) <= M; i+=2UL )
10756 {
10757 const size_t jend( LOW ? i+2UL : N );
10758 size_t j( UPP ? i : 0UL );
10759
      // 2x2 tiles.
10760 for( ; (j+2UL) <= jend; j+=2UL )
10761 {
10762 const size_t kbegin( ( IsUpper_v<MT4> )
10763 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10764 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10765 const size_t kend( ( IsLower_v<MT4> )
10766 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
10767 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
10768
10769 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10770 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10771
10772 size_t k( kbegin );
10773
10774 if( k < kpos )
10775 {
10776 SIMDType a1( A.load(i ,k) );
10777 SIMDType a2( A.load(i+1UL,k) );
10778 SIMDType b1( B.load(k,j ) );
10779 SIMDType b2( B.load(k,j+1UL) );
10780 SIMDType xmm1( a1 * b1 );
10781 SIMDType xmm2( a1 * b2 );
10782 SIMDType xmm3( a2 * b1 );
10783 SIMDType xmm4( a2 * b2 );
10784
10785 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10786 a1 = A.load(i ,k);
10787 a2 = A.load(i+1UL,k);
10788 b1 = B.load(k,j );
10789 b2 = B.load(k,j+1UL);
10790 xmm1 += a1 * b1;
10791 xmm2 += a1 * b2;
10792 xmm3 += a2 * b1;
10793 xmm4 += a2 * b2;
10794 }
10795
10796 C(i ,j ) += sum( xmm1 ) * scalar;
10797 C(i ,j+1UL) += sum( xmm2 ) * scalar;
10798 C(i+1UL,j ) += sum( xmm3 ) * scalar;
10799 C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
10800
10801 for( ; remainder && k<kend; ++k ) {
10802 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10803 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10804 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10805 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
10806 }
10807 }
10808 else if( k < kend )
10809 {
10810 ElementType value1( A(i ,k) * B(k,j ) );
10811 ElementType value2( A(i ,k) * B(k,j+1UL) );
10812 ElementType value3( A(i+1UL,k) * B(k,j ) );
10813 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
10814
10815 for( ++k; k<kend; ++k ) {
10816 value1 += A(i ,k) * B(k,j );
10817 value2 += A(i ,k) * B(k,j+1UL);
10818 value3 += A(i+1UL,k) * B(k,j );
10819 value4 += A(i+1UL,k) * B(k,j+1UL);
10820 }
10821
10822 C(i ,j ) += value1 * scalar;
10823 C(i ,j+1UL) += value2 * scalar;
10824 C(i+1UL,j ) += value3 * scalar;
10825 C(i+1UL,j+1UL) += value4 * scalar;
10826 }
10827 }
10828
      // 2x1 tile for the last column of the clipped range.
10829 if( j < jend )
10830 {
10831 const size_t kbegin( ( IsUpper_v<MT4> )
10832 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10833 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10834 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
10835
10836 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10837 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10838
10839 size_t k( kbegin );
10840
10841 if( k < kpos )
10842 {
10843 SIMDType b1( B.load(k,j) );
10844 SIMDType xmm1( A.load(i ,k) * b1 );
10845 SIMDType xmm2( A.load(i+1UL,k) * b1 );
10846
10847 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10848 b1 = B.load(k,j);
10849 xmm1 += A.load(i ,k) * b1;
10850 xmm2 += A.load(i+1UL,k) * b1;
10851 }
10852
10853 C(i ,j) += sum( xmm1 ) * scalar;
10854 C(i+1UL,j) += sum( xmm2 ) * scalar;
10855
10856 for( ; remainder && k<kend; ++k ) {
10857 C(i ,j) += A(i ,k) * B(k,j) * scalar;
10858 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
10859 }
10860 }
10861 else if( k < kend )
10862 {
10863 ElementType value1( A(i ,k) * B(k,j) );
10864 ElementType value2( A(i+1UL,k) * B(k,j) );
10865
10866 for( ++k; k<kend; ++k ) {
10867 value1 += A(i ,k) * B(k,j);
10868 value2 += A(i+1UL,k) * B(k,j);
10869 }
10870
10871 C(i ,j) += value1 * scalar;
10872 C(i+1UL,j) += value2 * scalar;
10873 }
10874 }
10875 }
10876
      // Pass 4: at most one row is left at this point.
10877 if( i < M )
10878 {
10879 const size_t jend( LOW ? i+1UL : N );
10880 size_t j( UPP ? i : 0UL );
10881
      // 1x2 tiles. Only the structure of B can still bound the inner range from above.
10882 for( ; (j+2UL) <= jend; j+=2UL )
10883 {
10884 const size_t kbegin( ( IsUpper_v<MT4> )
10885 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10886 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10887 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
10888
10889 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
10890 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
10891
10892 size_t k( kbegin );
10893
10894 if( k < kpos )
10895 {
10896 SIMDType a1( A.load(i,k) );
10897 SIMDType xmm1( a1 * B.load(k,j ) );
10898 SIMDType xmm2( a1 * B.load(k,j+1UL) );
10899
10900 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10901 a1 = A.load(i,k);
10902 xmm1 += a1 * B.load(k,j );
10903 xmm2 += a1 * B.load(k,j+1UL);
10904 }
10905
10906 C(i,j ) += sum( xmm1 ) * scalar;
10907 C(i,j+1UL) += sum( xmm2 ) * scalar;
10908
10909 for( ; remainder && k<kend; ++k ) {
10910 C(i,j ) += A(i,k) * B(k,j ) * scalar;
10911 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
10912 }
10913 }
10914 else if( k < kend )
10915 {
10916 ElementType value1( A(i,k) * B(k,j ) );
10917 ElementType value2( A(i,k) * B(k,j+1UL) );
10918
10919 for( ++k; k<kend; ++k ) {
10920 value1 += A(i,k) * B(k,j );
10921 value2 += A(i,k) * B(k,j+1UL);
10922 }
10923
10924 C(i,j ) += value1 * scalar;
10925 C(i,j+1UL) += value2 * scalar;
10926 }
10927 }
10928
      // 1x1 tile: a single dot product. The inner range always extends to K here.
10929 if( j < jend )
10930 {
10931 const size_t kbegin( ( IsUpper_v<MT4> )
10932 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
10933 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
10934
10935 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
10936 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
10937
10938 size_t k( kbegin );
10939
10940 if( k < kpos )
10941 {
10942 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
10943
10944 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
10945 xmm1 += A.load(i,k) * B.load(k,j);
10946 }
10947
10948 C(i,j) += sum( xmm1 ) * scalar;
10949
10950 for( ; remainder && k<K; ++k ) {
10951 C(i,j) += A(i,k) * B(k,j) * scalar;
10952 }
10953 }
10954 else if( k < K )
10955 {
10956 ElementType value( A(i,k) * B(k,j) );
10957
10958 for( ++k; k<K; ++k ) {
10959 value += A(i,k) * B(k,j);
10960 }
10961
10962 C(i,j) += value * scalar;
10963 }
10964 }
10965 }
10966 }
10967 //**********************************************************************************************
10968
10969 //**Default addition assignment to dense matrices (large matrices)******************************
/*!\brief Default addition assignment of a large scaled dense matrix product (C+=s*A*B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected if UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false
// (DisableIf_t in the trailing return type). It has no implementation of its own and
// forwards all arguments unchanged to selectDefaultAddAssignKernel().
*/
10983 template< typename MT3 // Type of the left-hand side target matrix
10984 , typename MT4 // Type of the left-hand side matrix operand
10985 , typename MT5 // Type of the right-hand side matrix operand
10986 , typename ST2 > // Type of the scalar value
10987 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10988 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10989 {
10990 selectDefaultAddAssignKernel( C, A, B, scalar );
10991 }
10992 //**********************************************************************************************
10993
10994 //**Vectorized default addition assignment to dense matrices (large matrices)*******************
/*!\brief Vectorized default addition assignment of a large scaled dense matrix product
//        (C+=s*A*B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected if UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true. It
// dispatches on the LOW and UPP flags to one of the kernels lmmm(), ummm() or mmm(), with LOW
// taking precedence over UPP.
// NOTE(review): the last two arguments are presumably the factor for the product (scalar) and
// the factor for the existing content of C (one, i.e. accumulate) - confirm against the
// signatures of lmmm()/ummm()/mmm().
*/
11009 template< typename MT3 // Type of the left-hand side target matrix
11010 , typename MT4 // Type of the left-hand side matrix operand
11011 , typename MT5 // Type of the right-hand side matrix operand
11012 , typename ST2 > // Type of the scalar value
11013 static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11014 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11015 {
11016 if( LOW )
11017 lmmm( C, A, B, scalar, ST2(1) );
11018 else if( UPP )
11019 ummm( C, A, B, scalar, ST2(1) );
11020 else
11021 mmm( C, A, B, scalar, ST2(1) );
11022 }
11023 //**********************************************************************************************
11024
11025 //**BLAS-based addition assignment to dense matrices (default)**********************************
/*!\brief Fallback for the BLAS-based addition assignment kernel.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is removed from overload resolution (via \c DisableIf_t) whenever
// \c UseBlasKernel_v<MT3,MT4,MT5,ST2> holds. In all other cases it forwards the four
// arguments unchanged to selectLargeAddAssignKernel().
*/
11039 template< typename MT3 // Type of the left-hand side target matrix
11040 , typename MT4 // Type of the left-hand side matrix operand
11041 , typename MT5 // Type of the right-hand side matrix operand
11042 , typename ST2 > // Type of the scalar value
11043 static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11044 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
11045 {
// BLAS kernel not applicable for these types: use the large-matrix kernel instead.
11046 selectLargeAddAssignKernel( C, A, B, scalar );
11047 }
11048 //**********************************************************************************************
11049
11050 //**BLAS-based addition assignment to dense matrices********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
/*!\brief BLAS-based addition assignment kernel.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is only compiled if both \c BLAZE_BLAS_MODE and
// \c BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set, and it is only available if
// \c UseBlasKernel_v<MT3,MT4,MT5,ST2> holds. The scaling factor is converted to the element
// type of the target matrix before it is handed to the BLAS wrappers.
//
//  - If \a MT4 is triangular, a serially evaluated copy of \a B is passed to \c trmm()
//    together with \a A and \c CblasLeft, and the copy is afterwards added to \a C.
//  - Otherwise, if \a MT5 is triangular, a serially evaluated copy of \a A is passed to
//    \c trmm() together with \a B and \c CblasRight, and the copy is afterwards added to \a C.
//  - Otherwise \c gemm() is called directly on \a C.
//
// NOTE(review): \c trmm() presumably overwrites its first argument with the scaled triangular
// product, and the last argument of \c gemm() presumably is the factor applied to the existing
// content of \a C - confirm against the BLAS wrapper documentation.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
{
   using ET = ElementType_t<MT3>;

   // Triangular left operand: the triangular matrix is applied from the left
   if( IsTriangular_v<MT4> ) {
      ResultType_t<MT3> product( serial( B ) );
      trmm( product, A, CblasLeft, ( IsLower_v<MT4> ? CblasLower : CblasUpper ), ET(scalar) );
      addAssign( C, product );
      return;
   }

   // Triangular right operand: the triangular matrix is applied from the right
   if( IsTriangular_v<MT5> ) {
      ResultType_t<MT3> product( serial( A ) );
      trmm( product, B, CblasRight, ( IsLower_v<MT5> ? CblasLower : CblasUpper ), ET(scalar) );
      addAssign( C, product );
      return;
   }

   // No triangular operand: general matrix-matrix multiplication directly on the target
   gemm( C, A, B, ET(scalar), ET(1) );
}
#endif
11089 //**********************************************************************************************
11090
11091 //**Addition assignment to sparse matrices******************************************************
11092 // No special implementation for the addition assignment to sparse matrices.
11093 //**********************************************************************************************
11094
11095 //**Subtraction assignment to dense matrices****************************************************
11107 template< typename MT // Type of the target dense matrix
11108 , bool SO > // Storage order of the target dense matrix
11109 friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11110 {
11112
11113 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
11114 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
11115
11116 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
11117 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
11118
11119 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
11120 return;
11121 }
11122
11123 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
11124 RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
11125
11126 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11127 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11128 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11129 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11130 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
11131 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
11132
11133 DMatScalarMultExpr::selectSubAssignKernel( *lhs, A, B, rhs.scalar_ );
11134 }
11135 //**********************************************************************************************
11136
11137 //**Subtraction assignment to dense matrices (kernel selection)*********************************
11148 template< typename MT3 // Type of the left-hand side target matrix
11149 , typename MT4 // Type of the left-hand side matrix operand
11150 , typename MT5 // Type of the right-hand side matrix operand
11151 , typename ST2 > // Type of the scalar value
11152 static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11153 {
11154 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
11155 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
11156 selectSmallSubAssignKernel( C, A, B, scalar );
11157 else
11158 selectBlasSubAssignKernel( C, A, B, scalar );
11159 }
11160 //**********************************************************************************************
11161
11162 //**Default subtraction assignment to dense matrices (general/general)**************************
/*!\brief Default subtraction assignment kernel for two non-diagonal operands.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is only available if neither \a MT4 nor \a MT5 is diagonal. The scaled
// product is first evaluated serially into a temporary of type \c ResultType; the temporary
// is then subtracted from \a C by means of subAssign().
*/
11176 template< typename MT3 // Type of the left-hand side target matrix
11177 , typename MT4 // Type of the left-hand side matrix operand
11178 , typename MT5 // Type of the right-hand side matrix operand
11179 , typename ST2 > // Type of the scalar value
11180 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11181 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11182 {
// Evaluate the complete scaled product into a temporary ...
11183 const ResultType tmp( serial( A * B * scalar ) );
// ... and subtract the temporary from the target.
11184 subAssign( C, tmp );
11185 }
11186 //**********************************************************************************************
11187
11188 //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
11202 template< typename MT3 // Type of the left-hand side target matrix
11203 , typename MT4 // Type of the left-hand side matrix operand
11204 , typename MT5 // Type of the right-hand side matrix operand
11205 , typename ST2 > // Type of the scalar value
11206 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11207 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11208 {
11209 const size_t M( A.rows() );
11210 const size_t N( B.columns() );
11211
11212 for( size_t i=0UL; i<M; ++i )
11213 {
11214 const size_t jbegin( ( IsUpper_v<MT4> )
11215 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
11216 :( 0UL ) );
11217 const size_t jend( ( IsLower_v<MT4> )
11218 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
11219 :( N ) );
11220 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
11221
11222 const size_t jnum( jend - jbegin );
11223 const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
11224 BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );
11225
11226 for( size_t j=jbegin; j<jpos; j+=2UL ) {
11227 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
11228 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
11229 }
11230 if( jpos < jend ) {
11231 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
11232 }
11233 }
11234 }
11235 //**********************************************************************************************
11236
11237 //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
11251 template< typename MT3 // Type of the left-hand side target matrix
11252 , typename MT4 // Type of the left-hand side matrix operand
11253 , typename MT5 // Type of the right-hand side matrix operand
11254 , typename ST2 > // Type of the scalar value
11255 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11256 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11257 {
11258 constexpr size_t block( BLOCK_SIZE );
11259
11260 const size_t M( A.rows() );
11261 const size_t N( B.columns() );
11262
11263 for( size_t jj=0UL; jj<N; jj+=block ) {
11264 const size_t jend( min( N, jj+block ) );
11265 for( size_t ii=0UL; ii<M; ii+=block ) {
11266 const size_t iend( min( M, ii+block ) );
11267 for( size_t j=jj; j<jend; ++j )
11268 {
11269 const size_t ibegin( ( IsLower_v<MT4> )
11270 ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
11271 :( ii ) );
11272 const size_t ipos( ( IsUpper_v<MT4> )
11273 ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
11274 :( iend ) );
11275
11276 for( size_t i=ibegin; i<ipos; ++i ) {
11277 C(i,j) -= A(i,j) * B(j,j) * scalar;
11278 }
11279 }
11280 }
11281 }
11282 }
11283 //**********************************************************************************************
11284
11285 //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
11300 template< typename MT3 // Type of the left-hand side target matrix
11301 , typename MT4 // Type of the left-hand side matrix operand
11302 , typename MT5 // Type of the right-hand side matrix operand
11303 , typename ST2 > // Type of the scalar value
11304 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11305 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11306 {
11307 constexpr size_t block( BLOCK_SIZE );
11308
11309 const size_t M( A.rows() );
11310 const size_t N( B.columns() );
11311
11312 for( size_t ii=0UL; ii<M; ii+=block ) {
11313 const size_t iend( min( M, ii+block ) );
11314 for( size_t jj=0UL; jj<N; jj+=block ) {
11315 const size_t jend( min( N, jj+block ) );
11316 for( size_t i=ii; i<iend; ++i )
11317 {
11318 const size_t jbegin( ( IsUpper_v<MT5> )
11319 ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
11320 :( jj ) );
11321 const size_t jpos( ( IsLower_v<MT5> )
11322 ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
11323 :( jend ) );
11324
11325 for( size_t j=jbegin; j<jpos; ++j ) {
11326 C(i,j) -= A(i,i) * B(i,j) * scalar;
11327 }
11328 }
11329 }
11330 }
11331 }
11332 //**********************************************************************************************
11333
11334 //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
11349 template< typename MT3 // Type of the left-hand side target matrix
11350 , typename MT4 // Type of the left-hand side matrix operand
11351 , typename MT5 // Type of the right-hand side matrix operand
11352 , typename ST2 > // Type of the scalar value
11353 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11354 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11355 {
11356 const size_t M( A.rows() );
11357 const size_t N( B.columns() );
11358
11359 for( size_t j=0UL; j<N; ++j )
11360 {
11361 const size_t ibegin( ( IsLower_v<MT5> )
11362 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
11363 :( 0UL ) );
11364 const size_t iend( ( IsUpper_v<MT5> )
11365 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
11366 :( M ) );
11367 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
11368
11369 const size_t inum( iend - ibegin );
11370 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
11371 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
11372
11373 for( size_t i=ibegin; i<ipos; i+=2UL ) {
11374 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
11375 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
11376 }
11377 if( ipos < iend ) {
11378 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
11379 }
11380 }
11381 }
11382 //**********************************************************************************************
11383
11384 //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
11398 template< typename MT3 // Type of the left-hand side target matrix
11399 , typename MT4 // Type of the left-hand side matrix operand
11400 , typename MT5 // Type of the right-hand side matrix operand
11401 , typename ST2 > // Type of the scalar value
11402 static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11403 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11404 {
11405 for( size_t i=0UL; i<A.rows(); ++i ) {
11406 C(i,i) -= A(i,i) * B(i,i) * scalar;
11407 }
11408 }
11409 //**********************************************************************************************
11410
11411 //**Default subtraction assignment to dense matrices (small matrices)***************************
/*!\brief Non-vectorized subtraction assignment kernel for small matrices.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is removed from overload resolution (via \c DisableIf_t) whenever
// \c UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds. It has no implementation of its
// own and forwards all four arguments unchanged to selectDefaultSubAssignKernel().
*/
11425 template< typename MT3 // Type of the left-hand side target matrix
11426 , typename MT4 // Type of the left-hand side matrix operand
11427 , typename MT5 // Type of the right-hand side matrix operand
11428 , typename ST2 > // Type of the scalar value
11429 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11430 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11431 {
// No dedicated small-matrix code path without vectorization: reuse the default kernel.
11432 selectDefaultSubAssignKernel( C, A, B, scalar );
11433 }
11434 //**********************************************************************************************
11435
11436 //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
11451 template< typename MT3 // Type of the left-hand side target matrix
11452 , typename MT4 // Type of the left-hand side matrix operand
11453 , typename MT5 // Type of the right-hand side matrix operand
11454 , typename ST2 > // Type of the scalar value
11455 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11456 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11457 {
11458 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
11459
11460 const size_t M( A.rows() );
11461 const size_t N( B.columns() );
11462 const size_t K( A.columns() );
11463
11464 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
11465
11466 size_t i( 0UL );
11467
11468 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
11469 {
11470 const size_t jend( LOW ? i+3UL : N );
11471 size_t j( UPP ? i : 0UL );
11472
11473 for( ; (j+3UL) <= jend; j+=3UL )
11474 {
11475 const size_t kbegin( ( IsUpper_v<MT4> )
11476 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
11477 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
11478 const size_t kend( ( IsLower_v<MT4> )
11479 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
11480 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
11481
11482 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
11483 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
11484
11485 size_t k( kbegin );
11486
11487 if( k < kpos )
11488 {
11489 SIMDType a1( A.load(i ,k) );
11490 SIMDType a2( A.load(i+1UL,k) );
11491 SIMDType a3( A.load(i+2UL,k) );
11492 SIMDType b1( B.load(k,j ) );
11493 SIMDType b2( B.load(k,j+1UL) );
11494 SIMDType b3( B.load(k,j+2UL) );
11495 SIMDType xmm1( a1 * b1 );
11496 SIMDType xmm2( a1 * b2 );
11497 SIMDType xmm3( a1 * b3 );
11498 SIMDType xmm4( a2 * b1 );
11499 SIMDType xmm5( a2 * b2 );
11500 SIMDType xmm6( a2 * b3 );
11501 SIMDType xmm7( a3 * b1 );
11502 SIMDType xmm8( a3 * b2 );
11503 SIMDType xmm9( a3 * b3 );
11504
11505 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
11506 a1 = A.load(i ,k);
11507 a2 = A.load(i+1UL,k);
11508 a3 = A.load(i+2UL,k);
11509 b1 = B.load(k,j );
11510 b2 = B.load(k,j+1UL);
11511 b3 = B.load(k,j+2UL);
11512 xmm1 += a1 * b1;
11513 xmm2 += a1 * b2;
11514 xmm3 += a1 * b3;
11515 xmm4 += a2 * b1;
11516 xmm5 += a2 * b2;
11517 xmm6 += a2 * b3;
11518 xmm7 += a3 * b1;
11519 xmm8 += a3 * b2;
11520 xmm9 += a3 * b3;
11521 }
11522
11523 C(i ,j ) -= sum( xmm1 ) * scalar;
11524 C(i ,j+1UL) -= sum( xmm2 ) * scalar;
11525 C(i ,j+2UL) -= sum( xmm3 ) * scalar;
11526 C(i+1UL,j ) -= sum( xmm4 ) * scalar;
11527 C(i+1UL,j+1UL) -= sum( xmm5 ) * scalar;
11528 C(i+1UL,j+2UL) -= sum( xmm6 ) * scalar;
11529 C(i+2UL,j ) -= sum( xmm7 ) * scalar;
11530 C(i+2UL,j+1UL) -= sum( xmm8 ) * scalar;
11531 C(i+2UL,j+2UL) -= sum( xmm9 ) * scalar;
11532
11533 for( ; remainder && k<kend; ++k ) {
11534 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
11535 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
11536 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
11537 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
11538 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
11539 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
11540 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
11541 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
11542 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL) * scalar;
11543 }
11544 }
11545 else if( k < kend )
11546 {
11547 ElementType value1( A(i ,k) * B(k,j ) );
11548 ElementType value2( A(i ,k) * B(k,j+1UL) );
11549 ElementType value3( A(i ,k) * B(k,j+2UL) );
11550 ElementType value4( A(i+1UL,k) * B(k,j ) );
11551 ElementType value5( A(i+1UL,k) * B(k,j+1UL) );
11552 ElementType value6( A(i+1UL,k) * B(k,j+2UL) );
11553 ElementType value7( A(i+2UL,k) * B(k,j ) );
11554 ElementType value8( A(i+2UL,k) * B(k,j+1UL) );
11555 ElementType value9( A(i+2UL,k) * B(k,j+2UL) );
11556
11557 for( ++k; k<kend; ++k ) {
11558 value1 += A(i ,k) * B(k,j );
11559 value2 += A(i ,k) * B(k,j+1UL);
11560 value3 += A(i ,k) * B(k,j+2UL);
11561 value4 += A(i+1UL,k) * B(k,j );
11562 value5 += A(i+1UL,k) * B(k,j+1UL);
11563 value6 += A(i+1UL,k) * B(k,j+2UL);
11564 value7 += A(i+2UL,k) * B(k,j );
11565 value8 += A(i+2UL,k) * B(k,j+1UL);
11566 value9 += A(i+2UL,k) * B(k,j+2UL);
11567 }
11568
11569 C(i ,j ) -= value1 * scalar;
11570 C(i ,j+1UL) -= value2 * scalar;
11571 C(i ,j+2UL) -= value3 * scalar;
11572 C(i+1UL,j ) -= value4 * scalar;
11573 C(i+1UL,j+1UL) -= value5 * scalar;
11574 C(i+1UL,j+2UL) -= value6 * scalar;
11575 C(i+2UL,j ) -= value7 * scalar;
11576 C(i+2UL,j+1UL) -= value8 * scalar;
11577 C(i+2UL,j+2UL) -= value9 * scalar;
11578 }
11579 }
11580
11581 for( ; (j+2UL) <= jend; j+=2UL )
11582 {
11583 const size_t kbegin( ( IsUpper_v<MT4> )
11584 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
11585 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
11586 const size_t kend( ( IsLower_v<MT4> )
11587 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
11588 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
11589
11590 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
11591 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
11592
11593 size_t k( kbegin );
11594
11595 if( k < kpos )
11596 {
11597 SIMDType a1( A.load(i ,k) );
11598 SIMDType a2( A.load(i+1UL,k) );
11599 SIMDType a3( A.load(i+2UL,k) );
11600 SIMDType b1( B.load(k,j ) );
11601 SIMDType b2( B.load(k,j+1UL) );
11602 SIMDType xmm1( a1 * b1 );
11603 SIMDType xmm2( a1 * b2 );
11604 SIMDType xmm3( a2 * b1 );
11605 SIMDType xmm4( a2 * b2 );
11606 SIMDType xmm5( a3 * b1 );
11607 SIMDType xmm6( a3 * b2 );
11608
11609 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
11610 a1 = A.load(i ,k);
11611 a2 = A.load(i+1UL,k);
11612 a3 = A.load(i+2UL,k);
11613 b1 = B.load(k,j );
11614 b2 = B.load(k,j+1UL);
11615 xmm1 += a1 * b1;
11616 xmm2 += a1 * b2;
11617 xmm3 += a2 * b1;
11618 xmm4 += a2 * b2;
11619 xmm5 += a3 * b1;
11620 xmm6 += a3 * b2;
11621 }
11622
11623 C(i ,j ) -= sum( xmm1 ) * scalar;
11624 C(i ,j+1UL) -= sum( xmm2 ) * scalar;
11625 C(i+1UL,j ) -= sum( xmm3 ) * scalar;
11626 C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
11627 C(i+2UL,j ) -= sum( xmm5 ) * scalar;
11628 C(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
11629
11630 for( ; remainder && k<kend; ++k ) {
11631 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
11632 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
11633 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
11634 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
11635 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
11636 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
11637 }
11638 }
11639 else if( k < kend )
11640 {
11641 ElementType value1( A(i ,k) * B(k,j ) );
11642 ElementType value2( A(i ,k) * B(k,j+1UL) );
11643 ElementType value3( A(i+1UL,k) * B(k,j ) );
11644 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
11645 ElementType value5( A(i+2UL,k) * B(k,j ) );
11646 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
11647
11648 for( ++k; k<kend; ++k ) {
11649 value1 += A(i ,k) * B(k,j );
11650 value2 += A(i ,k) * B(k,j+1UL);
11651 value3 += A(i+1UL,k) * B(k,j );
11652 value4 += A(i+1UL,k) * B(k,j+1UL);
11653 value5 += A(i+2UL,k) * B(k,j );
11654 value6 += A(i+2UL,k) * B(k,j+1UL);
11655 }
11656
11657 C(i ,j ) -= value1 * scalar;
11658 C(i ,j+1UL) -= value2 * scalar;
11659 C(i+1UL,j ) -= value3 * scalar;
11660 C(i+1UL,j+1UL) -= value4 * scalar;
11661 C(i+2UL,j ) -= value5 * scalar;
11662 C(i+2UL,j+1UL) -= value6 * scalar;
11663 }
11664 }
11665
11666 if( j < jend )
11667 {
11668 const size_t kbegin( ( IsUpper_v<MT4> )
11669 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
11670 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
11671 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
11672
11673 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
11674 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
11675
11676 size_t k( kbegin );
11677
11678 if( k < kpos )
11679 {
11680 SIMDType b1( B.load(k,j) );
11681 SIMDType xmm1( A.load(i ,k) * b1 );
11682 SIMDType xmm2( A.load(i+1UL,k) * b1 );
11683 SIMDType xmm3( A.load(i+2UL,k) * b1 );
11684
11685 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
11686 b1 = B.load(k,j);
11687 xmm1 += A.load(i ,k) * b1;
11688 xmm2 += A.load(i+1UL,k) * b1;
11689 xmm3 += A.load(i+2UL,k) * b1;
11690 }
11691
11692 C(i ,j) -= sum( xmm1 ) * scalar;
11693 C(i+1UL,j) -= sum( xmm2 ) * scalar;
11694 C(i+2UL,j) -= sum( xmm3 ) * scalar;
11695
11696 for( ; remainder && k<kend; ++k ) {
11697 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
11698 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
11699 C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
11700 }
11701 }
11702 else if( k < kend )
11703 {
11704 ElementType value1( A(i ,k) * B(k,j) );
11705 ElementType value2( A(i+1UL,k) * B(k,j) );
11706 ElementType value3( A(i+2UL,k) * B(k,j) );
11707
11708 for( ++k; k<kend; ++k ) {
11709 value1 += A(i ,k) * B(k,j);
11710 value2 += A(i+1UL,k) * B(k,j);
11711 value3 += A(i+2UL,k) * B(k,j);
11712 }
11713
11714 C(i ,j) -= value1 * scalar;
11715 C(i+1UL,j) -= value2 * scalar;
11716 C(i+2UL,j) -= value3 * scalar;
11717 }
11718 }
11719 }
11720
11721 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
11722 {
11723 const size_t jend( LOW ? i+2UL : N );
11724 size_t j( UPP ? i : 0UL );
11725
11726 for( ; (j+4UL) <= jend; j+=4UL )
11727 {
11728 const size_t kbegin( ( IsUpper_v<MT4> )
11729 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
11730 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
11731 const size_t kend( ( IsLower_v<MT4> )
11732 ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
11733 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
11734
11735 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
11736 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
11737
11738 size_t k( kbegin );
11739
11740 if( k < kpos )
11741 {
11742 SIMDType a1( A.load(i ,k) );
11743 SIMDType a2( A.load(i+1UL,k) );
11744 SIMDType b1( B.load(k,j ) );
11745 SIMDType b2( B.load(k,j+1UL) );
11746 SIMDType b3( B.load(k,j+2UL) );
11747 SIMDType b4( B.load(k,j+3UL) );
11748 SIMDType xmm1( a1 * b1 );
11749 SIMDType xmm2( a1 * b2 );
11750 SIMDType xmm3( a1 * b3 );
11751 SIMDType xmm4( a1 * b4 );
11752 SIMDType xmm5( a2 * b1 );
11753 SIMDType xmm6( a2 * b2 );
11754 SIMDType xmm7( a2 * b3 );
11755 SIMDType xmm8( a2 * b4 );
11756
11757 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
11758 a1 = A.load(i ,k);
11759 a2 = A.load(i+1UL,k);
11760 b1 = B.load(k,j );
11761 b2 = B.load(k,j+1UL);
11762 b3 = B.load(k,j+2UL);
11763 b4 = B.load(k,j+3UL);
11764 xmm1 += a1 * b1;
11765 xmm2 += a1 * b2;
11766 xmm3 += a1 * b3;
11767 xmm4 += a1 * b4;
11768 xmm5 += a2 * b1;
11769 xmm6 += a2 * b2;
11770 xmm7 += a2 * b3;
11771 xmm8 += a2 * b4;
11772 }
11773
11774 C(i ,j ) -= sum( xmm1 ) * scalar;
11775 C(i ,j+1UL) -= sum( xmm2 ) * scalar;
11776 C(i ,j+2UL) -= sum( xmm3 ) * scalar;
11777 C(i ,j+3UL) -= sum( xmm4 ) * scalar;
11778 C(i+1UL,j ) -= sum( xmm5 ) * scalar;
11779 C(i+1UL,j+1UL) -= sum( xmm6 ) * scalar;
11780 C(i+1UL,j+2UL) -= sum( xmm7 ) * scalar;
11781 C(i+1UL,j+3UL) -= sum( xmm8 ) * scalar;
11782
11783 for( ; remainder && k<kend; ++k ) {
11784 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
11785 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
11786 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
11787 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
11788 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
11789 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
11790 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
11791 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
11792 }
11793 }
11794 else if( k < kend )
11795 {
11796 ElementType value1( A(i ,k) * B(k,j ) );
11797 ElementType value2( A(i ,k) * B(k,j+1UL) );
11798 ElementType value3( A(i ,k) * B(k,j+2UL) );
11799 ElementType value4( A(i ,k) * B(k,j+3UL) );
11800 ElementType value5( A(i+1UL,k) * B(k,j ) );
11801 ElementType value6( A(i+1UL,k) * B(k,j+1UL) );
11802 ElementType value7( A(i+1UL,k) * B(k,j+2UL) );
11803 ElementType value8( A(i+1UL,k) * B(k,j+3UL) );
11804
11805 for( ++k; k<kend; ++k ) {
11806 value1 += A(i ,k) * B(k,j );
11807 value2 += A(i ,k) * B(k,j+1UL);
11808 value3 += A(i ,k) * B(k,j+2UL);
11809 value4 += A(i ,k) * B(k,j+3UL);
11810 value5 += A(i+1UL,k) * B(k,j );
11811 value6 += A(i+1UL,k) * B(k,j+1UL);
11812 value7 += A(i+1UL,k) * B(k,j+2UL);
11813 value8 += A(i+1UL,k) * B(k,j+3UL);
11814 }
11815
11816 C(i ,j ) -= value1 * scalar;
11817 C(i ,j+1UL) -= value2 * scalar;
11818 C(i ,j+2UL) -= value3 * scalar;
11819 C(i ,j+3UL) -= value4 * scalar;
11820 C(i+1UL,j ) -= value5 * scalar;
11821 C(i+1UL,j+1UL) -= value6 * scalar;
11822 C(i+1UL,j+2UL) -= value7 * scalar;
11823 C(i+1UL,j+3UL) -= value8 * scalar;
11824 }
11825 }
11826
11827 for( ; (j+2UL) <= jend; j+=2UL )
11828 {
11829 const size_t kbegin( ( IsUpper_v<MT4> )
11830 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
11831 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
11832 const size_t kend( ( IsLower_v<MT4> )
11833 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
11834 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
11835
11836 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
11837 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
11838
11839 size_t k( kbegin );
11840
11841 if( k < kpos )
11842 {
11843 SIMDType a1( A.load(i ,k) );
11844 SIMDType a2( A.load(i+1UL,k) );
11845 SIMDType b1( B.load(k,j ) );
11846 SIMDType b2( B.load(k,j+1UL) );
11847 SIMDType xmm1( a1 * b1 );
11848 SIMDType xmm2( a1 * b2 );
11849 SIMDType xmm3( a2 * b1 );
11850 SIMDType xmm4( a2 * b2 );
11851
11852 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
11853 a1 = A.load(i ,k);
11854 a2 = A.load(i+1UL,k);
11855 b1 = B.load(k,j );
11856 b2 = B.load(k,j+1UL);
11857 xmm1 += a1 * b1;
11858 xmm2 += a1 * b2;
11859 xmm3 += a2 * b1;
11860 xmm4 += a2 * b2;
11861 }
11862
11863 C(i ,j ) -= sum( xmm1 ) * scalar;
11864 C(i ,j+1UL) -= sum( xmm2 ) * scalar;
11865 C(i+1UL,j ) -= sum( xmm3 ) * scalar;
11866 C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
11867
11868 for( ; remainder && k<kend; ++k ) {
11869 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
11870 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
11871 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
11872 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
11873 }
11874 }
11875 else if( k < kend )
11876 {
11877 ElementType value1( A(i ,k) * B(k,j ) );
11878 ElementType value2( A(i ,k) * B(k,j+1UL) );
11879 ElementType value3( A(i+1UL,k) * B(k,j ) );
11880 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
11881
11882 for( ++k; k<kend; ++k ) {
11883 value1 += A(i ,k) * B(k,j );
11884 value2 += A(i ,k) * B(k,j+1UL);
11885 value3 += A(i+1UL,k) * B(k,j );
11886 value4 += A(i+1UL,k) * B(k,j+1UL);
11887 }
11888
11889 C(i ,j ) -= value1 * scalar;
11890 C(i ,j+1UL) -= value2 * scalar;
11891 C(i+1UL,j ) -= value3 * scalar;
11892 C(i+1UL,j+1UL) -= value4 * scalar;
11893 }
11894 }
11895
11896 if( j < jend )
11897 {
11898 const size_t kbegin( ( IsUpper_v<MT4> )
11899 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
11900 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
11901 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
11902
11903 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
11904 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
11905
11906 size_t k( kbegin );
11907
11908 if( k < kpos )
11909 {
11910 SIMDType b1( B.load(k,j) );
11911 SIMDType xmm1( A.load(i ,k) * b1 );
11912 SIMDType xmm2( A.load(i+1UL,k) * b1 );
11913
11914 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
11915 b1 = B.load(k,j);
11916 xmm1 += A.load(i ,k) * b1;
11917 xmm2 += A.load(i+1UL,k) * b1;
11918 }
11919
11920 C(i ,j) -= sum( xmm1 ) * scalar;
11921 C(i+1UL,j) -= sum( xmm2 ) * scalar;
11922
11923 for( ; remainder && k<kend; ++k ) {
11924 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
11925 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
11926 }
11927 }
11928 else if( k < kend )
11929 {
11930 ElementType value1( A(i ,k) * B(k,j) );
11931 ElementType value2( A(i+1UL,k) * B(k,j) );
11932
11933 for( ++k; k<kend; ++k ) {
11934 value1 += A(i ,k) * B(k,j);
11935 value2 += A(i+1UL,k) * B(k,j);
11936 }
11937
11938 C(i ,j) -= value1 * scalar;
11939 C(i+1UL,j) -= value2 * scalar;
11940 }
11941 }
11942 }
11943
11944 for( ; i<M; ++i )
11945 {
11946 const size_t jend( LOW ? i+1UL : N );
11947 size_t j( UPP ? i : 0UL );
11948
11949 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
11950 {
11951 const size_t kbegin( ( IsUpper_v<MT4> )
11952 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
11953 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
11954 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
11955
11956 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
11957 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
11958
11959 size_t k( kbegin );
11960
11961 if( k < kpos )
11962 {
11963 SIMDType a1( A.load(i,k) );
11964 SIMDType xmm1( a1 * B.load(k,j ) );
11965 SIMDType xmm2( a1 * B.load(k,j+1UL) );
11966 SIMDType xmm3( a1 * B.load(k,j+2UL) );
11967 SIMDType xmm4( a1 * B.load(k,j+3UL) );
11968
11969 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
11970 a1 = A.load(i,k);
11971 xmm1 += a1 * B.load(k,j );
11972 xmm2 += a1 * B.load(k,j+1UL);
11973 xmm3 += a1 * B.load(k,j+2UL);
11974 xmm4 += a1 * B.load(k,j+3UL);
11975 }
11976
11977 C(i,j ) -= sum( xmm1 ) * scalar;
11978 C(i,j+1UL) -= sum( xmm2 ) * scalar;
11979 C(i,j+2UL) -= sum( xmm3 ) * scalar;
11980 C(i,j+3UL) -= sum( xmm4 ) * scalar;
11981
11982 for( ; remainder && k<kend; ++k ) {
11983 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
11984 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
11985 C(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
11986 C(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
11987 }
11988 }
11989 else if( k < kend )
11990 {
11991 ElementType value1( A(i,k) * B(k,j ) );
11992 ElementType value2( A(i,k) * B(k,j+1UL) );
11993 ElementType value3( A(i,k) * B(k,j+2UL) );
11994 ElementType value4( A(i,k) * B(k,j+3UL) );
11995
11996 for( ++k; k<kend; ++k ) {
11997 value1 += A(i,k) * B(k,j );
11998 value2 += A(i,k) * B(k,j+1UL);
11999 value3 += A(i,k) * B(k,j+2UL);
12000 value4 += A(i,k) * B(k,j+3UL);
12001 }
12002
12003 C(i,j ) -= value1 * scalar;
12004 C(i,j+1UL) -= value2 * scalar;
12005 C(i,j+2UL) -= value3 * scalar;
12006 C(i,j+3UL) -= value4 * scalar;
12007 }
12008 }
12009
12010 for( ; (j+2UL) <= jend; j+=2UL )
12011 {
12012 const size_t kbegin( ( IsUpper_v<MT4> )
12013 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12014 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12015 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
12016
12017 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12018 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12019
12020 size_t k( kbegin );
12021
12022 if( k < kpos )
12023 {
12024 SIMDType a1( A.load(i,k) );
12025 SIMDType xmm1( a1 * B.load(k,j ) );
12026 SIMDType xmm2( a1 * B.load(k,j+1UL) );
12027
12028 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
12029 a1 = A.load(i,k);
12030 xmm1 += a1 * B.load(k,j );
12031 xmm2 += a1 * B.load(k,j+1UL);
12032 }
12033
12034 C(i,j ) -= sum( xmm1 ) * scalar;
12035 C(i,j+1UL) -= sum( xmm2 ) * scalar;
12036
12037 for( ; remainder && k<kend; ++k ) {
12038 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
12039 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
12040 }
12041 }
12042 else if( k < kend )
12043 {
12044 ElementType value1( A(i,k) * B(k,j ) );
12045 ElementType value2( A(i,k) * B(k,j+1UL) );
12046
12047 for( ++k; k<kend; ++k ) {
12048 value1 += A(i,k) * B(k,j );
12049 value2 += A(i,k) * B(k,j+1UL);
12050 }
12051
12052 C(i,j ) -= value1 * scalar;
12053 C(i,j+1UL) -= value2 * scalar;
12054 }
12055 }
12056
12057 if( j < jend )
12058 {
12059 const size_t kbegin( ( IsUpper_v<MT4> )
12060 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12061 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12062
12063 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
12064 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
12065
12066 size_t k( kbegin );
12067
12068 if( k < kpos )
12069 {
12070 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
12071
12072 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
12073 xmm1 += A.load(i,k) * B.load(k,j);
12074 }
12075
12076 C(i,j) -= sum( xmm1 ) * scalar;
12077
12078 for( ; remainder && k<K; ++k ) {
12079 C(i,j) -= A(i,k) * B(k,j) * scalar;
12080 }
12081 }
12082 else if( k < K )
12083 {
12084 ElementType value( A(i,k) * B(k,j) );
12085
12086 for( ++k; k<K; ++k ) {
12087 value += A(i,k) * B(k,j);
12088 }
12089
12090 C(i,j) -= value * scalar;
12091 }
12092 }
12093 }
12094 }
12095 //**********************************************************************************************
12096
12097 //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
/*!\brief Vectorized default subtraction assignment kernel for small matrices (column-major target).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Every processed element C(i,j) is decremented by \a scalar times the sum over k of
// A(i,k)*B(k,j). The k-loop is executed with SIMD packs (A.load(i,k) * B.load(k,j)), several
// result elements are accumulated at once in register tiles (4x2, 3x3, 3x2, 2x2, ...), and a
// scalar loop finishes the k-range whenever the operands are not padded. The overload is only
// enabled for column-major targets for which UseVectorizedDefaultKernel_v holds.
// NOTE(review): LOW and UPP are flags declared outside this block; from their use here they
// restrict the processed (i,j) range to the lower/upper part of C - confirm against the class.
*/
12112 template< typename MT3 // Type of the left-hand side target matrix
12113 , typename MT4 // Type of the left-hand side matrix operand
12114 , typename MT5 // Type of the right-hand side matrix operand
12115 , typename ST2 > // Type of the scalar value
12116 static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12117 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12118 {
// 'remainder' is true if at least one operand is not padded. In that case the SIMD loops stop
// at kpos (kend passed through prevMultiple) and a scalar loop handles the range [kpos,kend).
12119 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
12120
12121 const size_t M( A.rows() );
12122 const size_t N( B.columns() );
12123 const size_t K( A.columns() );
12124
// The restricted (LOW/UPP) code paths below index columns relative to the row index and
// therefore require a square result.
12125 BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
12126
12127 size_t i( 0UL );
12128
// Tiles of four rows; only used when neither LOW nor UPP is set.
12129 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
12130 {
12131 size_t j( 0UL );
12132
// 4x2 tiles: eight accumulators for rows i..i+3 and columns j, j+1.
12133 for( ; (j+2UL) <= N; j+=2UL )
12134 {
// Narrow the k-range according to the compile-time structure of the operands (presumably the
// skipped products are structurally zero). kbegin is passed through prevMultiple() so that the
// SIMD loop starts on a SIMDSIZE boundary; kend is exclusive.
12135 const size_t kbegin( ( IsUpper_v<MT4> )
12136 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12137 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12138 const size_t kend( ( IsLower_v<MT4> )
12139 ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
12140 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
12141
12142 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12143 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12144
12145 size_t k( kbegin );
12146
// At least one SIMD step fits into [kbegin,kpos): the first step initializes the accumulators,
// the loop adds the remaining steps.
12147 if( k < kpos )
12148 {
12149 SIMDType a1( A.load(i ,k) );
12150 SIMDType a2( A.load(i+1UL,k) );
12151 SIMDType a3( A.load(i+2UL,k) );
12152 SIMDType a4( A.load(i+3UL,k) );
12153 SIMDType b1( B.load(k,j ) );
12154 SIMDType b2( B.load(k,j+1UL) );
12155 SIMDType xmm1( a1 * b1 );
12156 SIMDType xmm2( a1 * b2 );
12157 SIMDType xmm3( a2 * b1 );
12158 SIMDType xmm4( a2 * b2 );
12159 SIMDType xmm5( a3 * b1 );
12160 SIMDType xmm6( a3 * b2 );
12161 SIMDType xmm7( a4 * b1 );
12162 SIMDType xmm8( a4 * b2 );
12163
12164 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE )
12165 {
12166 a1 = A.load(i ,k);
12167 a2 = A.load(i+1UL,k);
12168 a3 = A.load(i+2UL,k);
12169 a4 = A.load(i+3UL,k);
12170 b1 = B.load(k,j );
12171 b2 = B.load(k,j+1UL);
12172 xmm1 += a1 * b1;
12173 xmm2 += a1 * b2;
12174 xmm3 += a2 * b1;
12175 xmm4 += a2 * b2;
12176 xmm5 += a3 * b1;
12177 xmm6 += a3 * b2;
12178 xmm7 += a4 * b1;
12179 xmm8 += a4 * b2;
12180 }
12181
// Reduce every accumulator with sum() and subtract the scaled result from the target element.
12182 C(i ,j ) -= sum( xmm1 ) * scalar;
12183 C(i ,j+1UL) -= sum( xmm2 ) * scalar;
12184 C(i+1UL,j ) -= sum( xmm3 ) * scalar;
12185 C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
12186 C(i+2UL,j ) -= sum( xmm5 ) * scalar;
12187 C(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
12188 C(i+3UL,j ) -= sum( xmm7 ) * scalar;
12189 C(i+3UL,j+1UL) -= sum( xmm8 ) * scalar;
12190
// Scalar tail for [kpos,kend); only executed for non-padded operands.
12191 for( ; remainder && k<kend; ++k ) {
12192 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
12193 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
12194 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
12195 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
12196 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
12197 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
12198 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
12199 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
12200 }
12201 }
// No SIMD step fits (kpos <= kbegin) but the k-range is not empty: accumulate element-wise and
// apply the scaling once per target element.
12202 else if( k < kend )
12203 {
12204 ElementType value1( A(i ,k) * B(k,j ) );
12205 ElementType value2( A(i ,k) * B(k,j+1UL) );
12206 ElementType value3( A(i+1UL,k) * B(k,j ) );
12207 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
12208 ElementType value5( A(i+2UL,k) * B(k,j ) );
12209 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
12210 ElementType value7( A(i+3UL,k) * B(k,j ) );
12211 ElementType value8( A(i+3UL,k) * B(k,j+1UL) );
12212
12213 for( ++k; k<kend; ++k ) {
12214 value1 += A(i ,k) * B(k,j );
12215 value2 += A(i ,k) * B(k,j+1UL);
12216 value3 += A(i+1UL,k) * B(k,j );
12217 value4 += A(i+1UL,k) * B(k,j+1UL);
12218 value5 += A(i+2UL,k) * B(k,j );
12219 value6 += A(i+2UL,k) * B(k,j+1UL);
12220 value7 += A(i+3UL,k) * B(k,j );
12221 value8 += A(i+3UL,k) * B(k,j+1UL);
12222 }
12223
12224 C(i ,j ) -= value1 * scalar;
12225 C(i ,j+1UL) -= value2 * scalar;
12226 C(i+1UL,j ) -= value3 * scalar;
12227 C(i+1UL,j+1UL) -= value4 * scalar;
12228 C(i+2UL,j ) -= value5 * scalar;
12229 C(i+2UL,j+1UL) -= value6 * scalar;
12230 C(i+3UL,j ) -= value7 * scalar;
12231 C(i+3UL,j+1UL) -= value8 * scalar;
12232 }
12233 }
12234
// 4x1 tile: a single column is left over when N is odd.
12235 if( j < N )
12236 {
12237 const size_t kbegin( ( IsUpper_v<MT4> )
12238 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12239 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12240 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
12241
12242 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12243 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12244
12245 size_t k( kbegin );
12246
12247 if( k < kpos )
12248 {
12249 SIMDType b1( B.load(k,j) );
12250 SIMDType xmm1( A.load(i ,k) * b1 );
12251 SIMDType xmm2( A.load(i+1UL,k) * b1 );
12252 SIMDType xmm3( A.load(i+2UL,k) * b1 );
12253 SIMDType xmm4( A.load(i+3UL,k) * b1 );
12254
12255 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
12256 b1 = B.load(k,j);
12257 xmm1 += A.load(i ,k) * b1;
12258 xmm2 += A.load(i+1UL,k) * b1;
12259 xmm3 += A.load(i+2UL,k) * b1;
12260 xmm4 += A.load(i+3UL,k) * b1;
12261 }
12262
12263 C(i ,j) -= sum( xmm1 ) * scalar;
12264 C(i+1UL,j) -= sum( xmm2 ) * scalar;
12265 C(i+2UL,j) -= sum( xmm3 ) * scalar;
12266 C(i+3UL,j) -= sum( xmm4 ) * scalar;
12267
12268 for( ; remainder && k<kend; ++k ) {
12269 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
12270 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
12271 C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
12272 C(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
12273 }
12274 }
12275 else if( k < kend )
12276 {
12277 ElementType value1( A(i ,k) * B(k,j) );
12278 ElementType value2( A(i+1UL,k) * B(k,j) );
12279 ElementType value3( A(i+2UL,k) * B(k,j) );
12280 ElementType value4( A(i+3UL,k) * B(k,j) );
12281
12282 for( ++k; k<kend; ++k ) {
12283 value1 += A(i ,k) * B(k,j);
12284 value2 += A(i+1UL,k) * B(k,j);
12285 value3 += A(i+2UL,k) * B(k,j);
12286 value4 += A(i+3UL,k) * B(k,j);
12287 }
12288
12289 C(i ,j) -= value1 * scalar;
12290 C(i+1UL,j) -= value2 * scalar;
12291 C(i+2UL,j) -= value3 * scalar;
12292 C(i+3UL,j) -= value4 * scalar;
12293 }
12294 }
12295 }
12296
// Tile of three rows; again only without LOW/UPP. After the loop above fewer than four rows
// remain in that case, so this loop body runs at most once.
12297 for( ; !LOW && !UPP && (i+3UL) <= M; i+=3UL )
12298 {
12299 size_t j( 0UL );
12300
// 3x3 tiles: nine accumulators.
12301 for( ; (j+3UL) <= N; j+=3UL )
12302 {
12303 const size_t kbegin( ( IsUpper_v<MT4> )
12304 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12305 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12306 const size_t kend( ( IsLower_v<MT4> )
12307 ?( IsUpper_v<MT5> ? min( i+3UL, j+3UL ) : ( i+3UL ) )
12308 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
12309
12310 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12311 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12312
12313 size_t k( kbegin );
12314
12315 if( k < kpos )
12316 {
12317 SIMDType a1( A.load(i ,k) );
12318 SIMDType a2( A.load(i+1UL,k) );
12319 SIMDType a3( A.load(i+2UL,k) );
12320 SIMDType b1( B.load(k,j ) );
12321 SIMDType b2( B.load(k,j+1UL) );
12322 SIMDType b3( B.load(k,j+2UL) );
12323 SIMDType xmm1( a1 * b1 );
12324 SIMDType xmm2( a1 * b2 );
12325 SIMDType xmm3( a1 * b3 );
12326 SIMDType xmm4( a2 * b1 );
12327 SIMDType xmm5( a2 * b2 );
12328 SIMDType xmm6( a2 * b3 );
12329 SIMDType xmm7( a3 * b1 );
12330 SIMDType xmm8( a3 * b2 );
12331 SIMDType xmm9( a3 * b3 );
12332
12333 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE )
12334 {
12335 a1 = A.load(i ,k);
12336 a2 = A.load(i+1UL,k);
12337 a3 = A.load(i+2UL,k);
12338 b1 = B.load(k,j );
12339 b2 = B.load(k,j+1UL);
12340 b3 = B.load(k,j+2UL);
12341 xmm1 += a1 * b1;
12342 xmm2 += a1 * b2;
12343 xmm3 += a1 * b3;
12344 xmm4 += a2 * b1;
12345 xmm5 += a2 * b2;
12346 xmm6 += a2 * b3;
12347 xmm7 += a3 * b1;
12348 xmm8 += a3 * b2;
12349 xmm9 += a3 * b3;
12350 }
12351
12352 C(i ,j ) -= sum( xmm1 ) * scalar;
12353 C(i ,j+1UL) -= sum( xmm2 ) * scalar;
12354 C(i ,j+2UL) -= sum( xmm3 ) * scalar;
12355 C(i+1UL,j ) -= sum( xmm4 ) * scalar;
12356 C(i+1UL,j+1UL) -= sum( xmm5 ) * scalar;
12357 C(i+1UL,j+2UL) -= sum( xmm6 ) * scalar;
12358 C(i+2UL,j ) -= sum( xmm7 ) * scalar;
12359 C(i+2UL,j+1UL) -= sum( xmm8 ) * scalar;
12360 C(i+2UL,j+2UL) -= sum( xmm9 ) * scalar;
12361
12362 for( ; remainder && k<kend; ++k ) {
12363 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
12364 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
12365 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
12366 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
12367 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
12368 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
12369 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
12370 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
12371 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL) * scalar;
12372 }
12373 }
12374 else if( k < kend )
12375 {
12376 ElementType value1( A(i ,k) * B(k,j ) );
12377 ElementType value2( A(i ,k) * B(k,j+1UL) );
12378 ElementType value3( A(i ,k) * B(k,j+2UL) );
12379 ElementType value4( A(i+1UL,k) * B(k,j ) );
12380 ElementType value5( A(i+1UL,k) * B(k,j+1UL) );
12381 ElementType value6( A(i+1UL,k) * B(k,j+2UL) );
12382 ElementType value7( A(i+2UL,k) * B(k,j ) );
12383 ElementType value8( A(i+2UL,k) * B(k,j+1UL) );
12384 ElementType value9( A(i+2UL,k) * B(k,j+2UL) );
12385
12386 for( ++k; k<kend; ++k ) {
12387 value1 += A(i ,k) * B(k,j );
12388 value2 += A(i ,k) * B(k,j+1UL);
12389 value3 += A(i ,k) * B(k,j+2UL);
12390 value4 += A(i+1UL,k) * B(k,j );
12391 value5 += A(i+1UL,k) * B(k,j+1UL);
12392 value6 += A(i+1UL,k) * B(k,j+2UL);
12393 value7 += A(i+2UL,k) * B(k,j );
12394 value8 += A(i+2UL,k) * B(k,j+1UL);
12395 value9 += A(i+2UL,k) * B(k,j+2UL);
12396 }
12397
12398 C(i ,j ) -= value1 * scalar;
12399 C(i ,j+1UL) -= value2 * scalar;
12400 C(i ,j+2UL) -= value3 * scalar;
12401 C(i+1UL,j ) -= value4 * scalar;
12402 C(i+1UL,j+1UL) -= value5 * scalar;
12403 C(i+1UL,j+2UL) -= value6 * scalar;
12404 C(i+2UL,j ) -= value7 * scalar;
12405 C(i+2UL,j+1UL) -= value8 * scalar;
12406 C(i+2UL,j+2UL) -= value9 * scalar;
12407 }
12408 }
12409
// 3x2 tile for two leftover columns.
12410 for( ; (j+2UL) <= N; j+=2UL )
12411 {
12412 const size_t kbegin( ( IsUpper_v<MT4> )
12413 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12414 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12415 const size_t kend( ( IsLower_v<MT4> )
12416 ?( IsUpper_v<MT5> ? min( i+3UL, j+2UL ) : ( i+3UL ) )
12417 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
12418
12419 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12420 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12421
12422 size_t k( kbegin );
12423
12424 if( k < kpos )
12425 {
12426 SIMDType a1( A.load(i ,k) );
12427 SIMDType a2( A.load(i+1UL,k) );
12428 SIMDType a3( A.load(i+2UL,k) );
12429 SIMDType b1( B.load(k,j ) );
12430 SIMDType b2( B.load(k,j+1UL) );
12431 SIMDType xmm1( a1 * b1 );
12432 SIMDType xmm2( a1 * b2 );
12433 SIMDType xmm3( a2 * b1 );
12434 SIMDType xmm4( a2 * b2 );
12435 SIMDType xmm5( a3 * b1 );
12436 SIMDType xmm6( a3 * b2 );
12437
12438 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE )
12439 {
12440 a1 = A.load(i ,k);
12441 a2 = A.load(i+1UL,k);
12442 a3 = A.load(i+2UL,k);
12443 b1 = B.load(k,j );
12444 b2 = B.load(k,j+1UL);
12445 xmm1 += a1 * b1;
12446 xmm2 += a1 * b2;
12447 xmm3 += a2 * b1;
12448 xmm4 += a2 * b2;
12449 xmm5 += a3 * b1;
12450 xmm6 += a3 * b2;
12451 }
12452
12453 C(i ,j ) -= sum( xmm1 ) * scalar;
12454 C(i ,j+1UL) -= sum( xmm2 ) * scalar;
12455 C(i+1UL,j ) -= sum( xmm3 ) * scalar;
12456 C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
12457 C(i+2UL,j ) -= sum( xmm5 ) * scalar;
12458 C(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
12459
12460 for( ; remainder && k<kend; ++k ) {
12461 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
12462 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
12463 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
12464 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
12465 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
12466 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
12467 }
12468 }
12469 else if( k < kend )
12470 {
12471 ElementType value1( A(i ,k) * B(k,j ) );
12472 ElementType value2( A(i ,k) * B(k,j+1UL) );
12473 ElementType value3( A(i+1UL,k) * B(k,j ) );
12474 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
12475 ElementType value5( A(i+2UL,k) * B(k,j ) );
12476 ElementType value6( A(i+2UL,k) * B(k,j+1UL) );
12477
12478 for( ++k; k<kend; ++k ) {
12479 value1 += A(i ,k) * B(k,j );
12480 value2 += A(i ,k) * B(k,j+1UL);
12481 value3 += A(i+1UL,k) * B(k,j );
12482 value4 += A(i+1UL,k) * B(k,j+1UL);
12483 value5 += A(i+2UL,k) * B(k,j );
12484 value6 += A(i+2UL,k) * B(k,j+1UL);
12485 }
12486
12487 C(i ,j ) -= value1 * scalar;
12488 C(i ,j+1UL) -= value2 * scalar;
12489 C(i+1UL,j ) -= value3 * scalar;
12490 C(i+1UL,j+1UL) -= value4 * scalar;
12491 C(i+2UL,j ) -= value5 * scalar;
12492 C(i+2UL,j+1UL) -= value6 * scalar;
12493 }
12494 }
12495
// 3x1 tile for the last leftover column.
12496 if( j < N )
12497 {
12498 const size_t kbegin( ( IsUpper_v<MT4> )
12499 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12500 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12501 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
12502
12503 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12504 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12505
12506 size_t k( kbegin );
12507
12508 if( k < kpos )
12509 {
12510 SIMDType b1( B.load(k,j) );
12511 SIMDType xmm1( A.load(i ,k) * b1 );
12512 SIMDType xmm2( A.load(i+1UL,k) * b1 );
12513 SIMDType xmm3( A.load(i+2UL,k) * b1 );
12514
12515 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
12516 b1 = B.load(k,j);
12517 xmm1 += A.load(i ,k) * b1;
12518 xmm2 += A.load(i+1UL,k) * b1;
12519 xmm3 += A.load(i+2UL,k) * b1;
12520 }
12521
12522 C(i ,j) -= sum( xmm1 ) * scalar;
12523 C(i+1UL,j) -= sum( xmm2 ) * scalar;
12524 C(i+2UL,j) -= sum( xmm3 ) * scalar;
12525
12526 for( ; remainder && k<kend; ++k ) {
12527 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
12528 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
12529 C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
12530 }
12531 }
12532 else if( k < kend )
12533 {
12534 ElementType value1( A(i ,k) * B(k,j) );
12535 ElementType value2( A(i+1UL,k) * B(k,j) );
12536 ElementType value3( A(i+2UL,k) * B(k,j) );
12537
12538 for( ++k; k<kend; ++k ) {
12539 value1 += A(i ,k) * B(k,j);
12540 value2 += A(i+1UL,k) * B(k,j);
12541 value3 += A(i+2UL,k) * B(k,j);
12542 }
12543
12544 C(i ,j) -= value1 * scalar;
12545 C(i+1UL,j) -= value2 * scalar;
12546 C(i+2UL,j) -= value3 * scalar;
12547 }
12548 }
12549 }
12550
// Tiles of two rows. This loop is used for all remaining row pairs and is the first one that
// runs when LOW or UPP is set: the column range is cut to [i,...) for UPP and to [...,i+2)
// for LOW.
12551 for( ; (i+2UL) <= M; i+=2UL )
12552 {
12553 const size_t jend( LOW ? i+2UL : N );
12554 size_t j( UPP ? i : 0UL );
12555
// 2x2 tiles: four accumulators.
12556 for( ; (j+2UL) <= jend; j+=2UL )
12557 {
12558 const size_t kbegin( ( IsUpper_v<MT4> )
12559 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12560 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12561 const size_t kend( ( IsLower_v<MT4> )
12562 ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
12563 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
12564
12565 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12566 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12567
12568 size_t k( kbegin );
12569
12570 if( k < kpos )
12571 {
12572 SIMDType a1( A.load(i ,k) );
12573 SIMDType a2( A.load(i+1UL,k) );
12574 SIMDType b1( B.load(k,j ) );
12575 SIMDType b2( B.load(k,j+1UL) );
12576 SIMDType xmm1( a1 * b1 );
12577 SIMDType xmm2( a1 * b2 );
12578 SIMDType xmm3( a2 * b1 );
12579 SIMDType xmm4( a2 * b2 );
12580
12581 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
12582 a1 = A.load(i ,k);
12583 a2 = A.load(i+1UL,k);
12584 b1 = B.load(k,j );
12585 b2 = B.load(k,j+1UL);
12586 xmm1 += a1 * b1;
12587 xmm2 += a1 * b2;
12588 xmm3 += a2 * b1;
12589 xmm4 += a2 * b2;
12590 }
12591
12592 C(i ,j ) -= sum( xmm1 ) * scalar;
12593 C(i ,j+1UL) -= sum( xmm2 ) * scalar;
12594 C(i+1UL,j ) -= sum( xmm3 ) * scalar;
12595 C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
12596
12597 for( ; remainder && k<kend; ++k ) {
12598 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
12599 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
12600 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
12601 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
12602 }
12603 }
12604 else if( k < kend )
12605 {
12606 ElementType value1( A(i ,k) * B(k,j ) );
12607 ElementType value2( A(i ,k) * B(k,j+1UL) );
12608 ElementType value3( A(i+1UL,k) * B(k,j ) );
12609 ElementType value4( A(i+1UL,k) * B(k,j+1UL) );
12610
12611 for( ++k; k<kend; ++k ) {
12612 value1 += A(i ,k) * B(k,j );
12613 value2 += A(i ,k) * B(k,j+1UL);
12614 value3 += A(i+1UL,k) * B(k,j );
12615 value4 += A(i+1UL,k) * B(k,j+1UL);
12616 }
12617
12618 C(i ,j ) -= value1 * scalar;
12619 C(i ,j+1UL) -= value2 * scalar;
12620 C(i+1UL,j ) -= value3 * scalar;
12621 C(i+1UL,j+1UL) -= value4 * scalar;
12622 }
12623 }
12624
// 2x1 tile for a leftover column of the current row pair.
12625 if( j < jend )
12626 {
12627 const size_t kbegin( ( IsUpper_v<MT4> )
12628 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12629 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12630 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
12631
12632 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12633 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12634
12635 size_t k( kbegin );
12636
12637 if( k < kpos )
12638 {
12639 SIMDType b1( B.load(k,j) );
12640 SIMDType xmm1( A.load(i ,k) * b1 );
12641 SIMDType xmm2( A.load(i+1UL,k) * b1 );
12642
12643 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
12644 b1 = B.load(k,j);
12645 xmm1 += A.load(i ,k) * b1;
12646 xmm2 += A.load(i+1UL,k) * b1;
12647 }
12648
12649 C(i ,j) -= sum( xmm1 ) * scalar;
12650 C(i+1UL,j) -= sum( xmm2 ) * scalar;
12651
12652 for( ; remainder && k<kend; ++k ) {
12653 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
12654 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
12655 }
12656 }
12657 else if( k < kend )
12658 {
12659 ElementType value1( A(i ,k) * B(k,j) );
12660 ElementType value2( A(i+1UL,k) * B(k,j) );
12661
12662 for( ++k; k<kend; ++k ) {
12663 value1 += A(i ,k) * B(k,j);
12664 value2 += A(i+1UL,k) * B(k,j);
12665 }
12666
12667 C(i ,j) -= value1 * scalar;
12668 C(i+1UL,j) -= value2 * scalar;
12669 }
12670 }
12671 }
12672
// At most one row is left at this point; it is processed with 1x2 tiles and a final 1x1 step.
12673 if( i < M )
12674 {
12675 const size_t jend( LOW ? i+1UL : N );
12676 size_t j( UPP ? i : 0UL );
12677
12678 for( ; (j+2UL) <= jend; j+=2UL )
12679 {
12680 const size_t kbegin( ( IsUpper_v<MT4> )
12681 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12682 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12683 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
12684
12685 const size_t kpos( remainder ? prevMultiple( kend, SIMDSIZE ) : kend );
12686 BLAZE_INTERNAL_ASSERT( kpos <= kend, "Invalid end calculation" );
12687
12688 size_t k( kbegin );
12689
12690 if( k < kpos )
12691 {
12692 SIMDType a1( A.load(i,k) );
12693 SIMDType xmm1( a1 * B.load(k,j ) );
12694 SIMDType xmm2( a1 * B.load(k,j+1UL) );
12695
12696 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
12697 a1 = A.load(i,k);
12698 xmm1 += a1 * B.load(k,j );
12699 xmm2 += a1 * B.load(k,j+1UL);
12700 }
12701
12702 C(i,j ) -= sum( xmm1 ) * scalar;
12703 C(i,j+1UL) -= sum( xmm2 ) * scalar;
12704
12705 for( ; remainder && k<kend; ++k ) {
12706 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
12707 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
12708 }
12709 }
12710 else if( k < kend )
12711 {
12712 ElementType value1( A(i,k) * B(k,j ) );
12713 ElementType value2( A(i,k) * B(k,j+1UL) );
12714
12715 for( ++k; k<kend; ++k ) {
12716 value1 += A(i,k) * B(k,j );
12717 value2 += A(i,k) * B(k,j+1UL);
12718 }
12719
12720 C(i,j ) -= value1 * scalar;
12721 C(i,j+1UL) -= value2 * scalar;
12722 }
12723 }
12724
// Last single element of the row. Only the start of the k-range is narrowed here (kbegin);
// the range always extends to K.
12725 if( j < jend )
12726 {
12727 const size_t kbegin( ( IsUpper_v<MT4> )
12728 ?( prevMultiple( ( IsLower_v<MT5> ? max( i, j ) : i ), SIMDSIZE ) )
12729 :( IsLower_v<MT5> ? prevMultiple( j, SIMDSIZE ) : 0UL ) );
12730
12731 const size_t kpos( remainder ? prevMultiple( K, SIMDSIZE ) : K );
12732 BLAZE_INTERNAL_ASSERT( kpos <= K, "Invalid end calculation" );
12733
12734 size_t k( kbegin );
12735
12736 if( k < kpos )
12737 {
12738 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
12739
12740 for( k+=SIMDSIZE; k<kpos; k+=SIMDSIZE ) {
12741 xmm1 += A.load(i,k) * B.load(k,j);
12742 }
12743
12744 C(i,j) -= sum( xmm1 ) * scalar;
12745
12746 for( ; remainder && k<K; ++k ) {
12747 C(i,j) -= A(i,k) * B(k,j) * scalar;
12748 }
12749 }
12750 else if( k < K )
12751 {
12752 ElementType value( A(i,k) * B(k,j) );
12753
12754 for( ++k; k<K; ++k ) {
12755 value += A(i,k) * B(k,j);
12756 }
12757
12758 C(i,j) -= value * scalar;
12759 }
12760 }
12761 }
12762 }
12763 //**********************************************************************************************
12764
12765 //**Default subtraction assignment to dense matrices (large matrices)***************************
12779 template< typename MT3 // Type of the left-hand side target matrix
12780 , typename MT4 // Type of the left-hand side matrix operand
12781 , typename MT5 // Type of the right-hand side matrix operand
12782 , typename ST2 > // Type of the scalar value
12783 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12784 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12785 {
12786 selectDefaultSubAssignKernel( C, A, B, scalar );
12787 }
12788 //**********************************************************************************************
12789
12790 //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
12805 template< typename MT3 // Type of the left-hand side target matrix
12806 , typename MT4 // Type of the left-hand side matrix operand
12807 , typename MT5 // Type of the right-hand side matrix operand
12808 , typename ST2 > // Type of the scalar value
12809 static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12810 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12811 {
12812 if( LOW )
12813 lmmm( C, A, B, -scalar, ST2(1) );
12814 else if( UPP )
12815 ummm( C, A, B, -scalar, ST2(1) );
12816 else
12817 mmm( C, A, B, -scalar, ST2(1) );
12818 }
12819 //**********************************************************************************************
12820
12821 //**BLAS-based subtraction assignment to dense matrices (default)*******************************
12835 template< typename MT3 // Type of the left-hand side target matrix
12836 , typename MT4 // Type of the left-hand side matrix operand
12837 , typename MT5 // Type of the right-hand side matrix operand
12838 , typename ST2 > // Type of the scalar value
12839 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12840 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
12841 {
12842 selectLargeSubAssignKernel( C, A, B, scalar );
12843 }
12844 //**********************************************************************************************
12845
12846 //**BLAS-based subtraction assignment to dense matrices******************************************
12847#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
12861 template< typename MT3 // Type of the left-hand side target matrix
12862 , typename MT4 // Type of the left-hand side matrix operand
12863 , typename MT5 // Type of the right-hand side matrix operand
12864 , typename ST2 > // Type of the scalar value
12865 static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12866 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
12867 {
12868 using ET = ElementType_t<MT3>;
12869
12870 if( IsTriangular_v<MT4> ) {
12871 ResultType_t<MT3> tmp( serial( B ) );
12872 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
12873 subAssign( C, tmp );
12874 }
12875 else if( IsTriangular_v<MT5> ) {
12876 ResultType_t<MT3> tmp( serial( A ) );
12877 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
12878 subAssign( C, tmp );
12879 }
12880 else {
12881 gemm( C, A, B, ET(-scalar), ET(1) );
12882 }
12883 }
12884#endif
12885 //**********************************************************************************************
12886
12887 //**Subtraction assignment to sparse matrices***************************************************
12888 // No special implementation for the subtraction assignment to sparse matrices.
12889 //**********************************************************************************************
12890
12891 //**Schur product assignment to dense matrices**************************************************
12903 template< typename MT // Type of the target dense matrix
12904 , bool SO > // Storage order of the target dense matrix
12905 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12906 {
12908
12912
12913 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
12914 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
12915
12916 const ResultType tmp( serial( rhs ) );
12917 schurAssign( *lhs, tmp );
12918 }
12919 //**********************************************************************************************
12920
12921 //**Schur product assignment to sparse matrices*************************************************
12922 // No special implementation for the Schur product assignment to sparse matrices.
12923 //**********************************************************************************************
12924
12925 //**Multiplication assignment to dense matrices*************************************************
12926 // No special implementation for the multiplication assignment to dense matrices.
12927 //**********************************************************************************************
12928
12929 //**Multiplication assignment to sparse matrices************************************************
12930 // No special implementation for the multiplication assignment to sparse matrices.
12931 //**********************************************************************************************
12932
12933 //**SMP assignment to dense matrices************************************************************
12948 template< typename MT // Type of the target dense matrix
12949 , bool SO > // Storage order of the target dense matrix
12950 friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12951 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
12952 {
12954
12955 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
12956 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
12957
12958 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
12959 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
12960
12961 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
12962 return;
12963 }
12964 else if( left.columns() == 0UL ) {
12965 reset( *lhs );
12966 return;
12967 }
12968
12969 LT A( left ); // Evaluation of the left-hand side dense matrix operand
12970 RT B( right ); // Evaluation of the right-hand side dense matrix operand
12971
12972 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
12973 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
12974 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
12975 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
12976 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
12977 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
12978
12979 smpAssign( *lhs, A * B * rhs.scalar_ );
12980 }
12981 //**********************************************************************************************
12982
12983 //**SMP assignment to sparse matrices***********************************************************
12998 template< typename MT // Type of the target sparse matrix
12999 , bool SO > // Storage order of the target sparse matrix
13000 friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13001 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13002 {
13004
13005 using TmpType = If_t< SO, OppositeType, ResultType >;
13006
13013
13014 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
13015 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
13016
13017 const ForwardFunctor fwd;
13018
13019 const TmpType tmp( rhs );
13020 smpAssign( *lhs, fwd( tmp ) );
13021 }
13022 //**********************************************************************************************
13023
13024 //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled dense matrix product (rhs) to a dense matrix (lhs).
// Participates in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2> holds.
// Both operands of the nested matrix product are evaluated first; the scaled product of
// the evaluated operands is then added to the target.
13039 template< typename MT // Type of the target dense matrix
13040 , bool SO > // Storage order of the target dense matrix
13041 friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13042 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13043 {
13045
// Internal sanity checks: target and expression must already agree in size.
13046 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
13047 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
13048
// Operands of the matrix product wrapped by the scalar multiplication.
13049 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13050 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13051
// Nothing to add when the target is empty or the inner dimension of the product is zero.
13052 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
13053 return;
13054 }
13055
13056 LT A( left ); // Evaluation of the left-hand side dense matrix operand
13057 RT B( right ); // Evaluation of the right-hand side dense matrix operand
13058
// The evaluated operands must keep the operand sizes and fit the target.
13059 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13060 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13061 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13062 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13063 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
13064 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
13065
// Rebuild the scaled product from the evaluated operands and dispatch the addition.
13066 smpAddAssign( *lhs, A * B * rhs.scalar_ );
13067 }
13068 //**********************************************************************************************
13069
13070 //**SMP addition assignment to sparse matrices**************************************************
13071 // No special implementation for the SMP addition assignment to sparse matrices.
13072 //**********************************************************************************************
13073
13074 //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled dense matrix product (rhs) to a dense matrix (lhs).
// Participates in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2> holds.
// Both operands of the nested matrix product are evaluated first; the scaled product of
// the evaluated operands is then subtracted from the target.
13089 template< typename MT // Type of the target dense matrix
13090 , bool SO > // Storage order of the target dense matrix
13091 friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13092 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13093 {
13095
// Internal sanity checks: target and expression must already agree in size.
13096 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
13097 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
13098
// Operands of the matrix product wrapped by the scalar multiplication.
13099 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13100 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13101
// Nothing to subtract when the target is empty or the inner dimension of the product is zero.
13102 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
13103 return;
13104 }
13105
13106 LT A( left ); // Evaluation of the left-hand side dense matrix operand
13107 RT B( right ); // Evaluation of the right-hand side dense matrix operand
13108
// The evaluated operands must keep the operand sizes and fit the target.
13109 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13110 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13111 BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13112 BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13113 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).rows() , "Invalid number of rows" );
13114 BLAZE_INTERNAL_ASSERT( B.columns() == (*lhs).columns(), "Invalid number of columns" );
13115
// Rebuild the scaled product from the evaluated operands and dispatch the subtraction.
13116 smpSubAssign( *lhs, A * B * rhs.scalar_ );
13117 }
13118 //**********************************************************************************************
13119
13120 //**SMP subtraction assignment to sparse matrices***********************************************
13121 // No special implementation for the SMP subtraction assignment to sparse matrices.
13122 //**********************************************************************************************
13123
13124 //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of a scaled dense matrix product (rhs) to a dense matrix (lhs).
// Unlike the neighbouring SMP assignment kernels this overload carries no EnableIf_t
// restriction. The expression is evaluated into a temporary of type ResultType, which is
// then handed to the smpSchurAssign overload for that temporary.
13136 template< typename MT // Type of the target dense matrix
13137 , bool SO > // Storage order of the target dense matrix
13138 friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13139 {
13141
13145
// Internal sanity checks: target and expression must already agree in size.
13146 BLAZE_INTERNAL_ASSERT( (*lhs).rows() == rhs.rows() , "Invalid number of rows" );
13147 BLAZE_INTERNAL_ASSERT( (*lhs).columns() == rhs.columns(), "Invalid number of columns" );
13148
// Evaluate the complete expression once before combining it with the target.
13149 const ResultType tmp( rhs );
13150 smpSchurAssign( *lhs, tmp );
13151 }
13152 //**********************************************************************************************
13153
13154 //**SMP Schur product assignment to sparse matrices*********************************************
13155 // No special implementation for the SMP Schur product assignment to sparse matrices.
13156 //**********************************************************************************************
13157
13158 //**SMP multiplication assignment to dense matrices*********************************************
13159 // No special implementation for the SMP multiplication assignment to dense matrices.
13160 //**********************************************************************************************
13161
13162 //**SMP multiplication assignment to sparse matrices********************************************
13163 // No special implementation for the SMP multiplication assignment to sparse matrices.
13164 //**********************************************************************************************
13165
13166 //**Compile time checks*************************************************************************
13175 //**********************************************************************************************
13176};
13178//*************************************************************************************************
13179
13180
13181
13182
13183//=================================================================================================
13184//
13185// GLOBAL BINARY ARITHMETIC OPERATORS
13186//
13187//=================================================================================================
13188
13189//*************************************************************************************************
// Multiplication operator for a dense matrix with storage-order flag 'false' (lhs) and a
// dense matrix with storage-order flag 'true' (rhs, the "transpose dense matrix" operand).
//
// Parameters:
//   lhs  The left-hand side dense matrix of the multiplication.
//   rhs  The right-hand side dense matrix of the multiplication.
// Returns:
//   A DMatTDMatMultExpr expression object referring to both operands; none of the
//   structure flags (symmetric, Hermitian, lower, upper) is set.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the number of columns of lhs differs from the
//   number of rows of rhs.
13219template< typename MT1 // Type of the left-hand side dense matrix
13220 , typename MT2 > // Type of the right-hand side dense matrix
13221inline decltype(auto)
13222 operator*( const DenseMatrix<MT1,false>& lhs, const DenseMatrix<MT2,true>& rhs )
13223{
13225
// The inner dimensions of the product must agree.
13226 if( (*lhs).columns() != (*rhs).rows() ) {
13227 BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
13228 }
13229
// Declares the alias used by the return statement; a plain product carries no
// structure information, hence all four flags are 'false'.
13230 using ReturnType = const DMatTDMatMultExpr<MT1,MT2,false,false,false,false>;
13231 return ReturnType( *lhs, *rhs );
13232}
13233//*************************************************************************************************
13234
13235
13236
13237
13238//=================================================================================================
13239//
13240// GLOBAL FUNCTIONS
13241//
13242//=================================================================================================
13243
13244//*************************************************************************************************
// Declares the given dense matrix product expression as symmetric.
//
// Parameters:
//   dm  The multiplication expression to be declared symmetric.
// Returns:
//   A new DMatTDMatMultExpr over the same operands with the symmetry flag forced to
//   'true'; the Hermitian, lower and upper flags are carried over unchanged.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13269template< typename MT1 // Type of the left-hand side dense matrix
13270 , typename MT2 // Type of the right-hand side dense matrix
13271 , bool SF // Symmetry flag
13272 , bool HF // Hermitian flag
13273 , bool LF // Lower flag
13274 , bool UF > // Upper flag
13275inline decltype(auto) declsym( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13276{
13278
// Only square matrices can be declared symmetric.
13279 if( !isSquare( dm ) ) {
13280 BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
13281 }
13282
// Same operands, symmetry flag set.
13283 using ReturnType = const DMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
13284 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13285}
13287//*************************************************************************************************
13288
13289
13290//*************************************************************************************************
// Declares the given dense matrix product expression as Hermitian.
//
// Parameters:
//   dm  The multiplication expression to be declared Hermitian.
// Returns:
//   A new DMatTDMatMultExpr over the same operands with the Hermitian flag forced to
//   'true'; the symmetry, lower and upper flags are carried over unchanged.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13315template< typename MT1 // Type of the left-hand side dense matrix
13316 , typename MT2 // Type of the right-hand side dense matrix
13317 , bool SF // Symmetry flag
13318 , bool HF // Hermitian flag
13319 , bool LF // Lower flag
13320 , bool UF > // Upper flag
13321inline decltype(auto) declherm( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13322{
13324
// Only square matrices can be declared Hermitian.
13325 if( !isSquare( dm ) ) {
13326 BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
13327 }
13328
// Same operands, Hermitian flag set.
13329 using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
13330 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13331}
13333//*************************************************************************************************
13334
13335
13336//*************************************************************************************************
// Declares the given dense matrix product expression as lower.
//
// Parameters:
//   dm  The multiplication expression to be declared lower.
// Returns:
//   A new DMatTDMatMultExpr over the same operands with the lower flag forced to
//   'true'; the symmetry, Hermitian and upper flags are carried over unchanged.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13361template< typename MT1 // Type of the left-hand side dense matrix
13362 , typename MT2 // Type of the right-hand side dense matrix
13363 , bool SF // Symmetry flag
13364 , bool HF // Hermitian flag
13365 , bool LF // Lower flag
13366 , bool UF > // Upper flag
13367inline decltype(auto) decllow( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13368{
13370
// Only square matrices can be declared lower.
13371 if( !isSquare( dm ) ) {
13372 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
13373 }
13374
// Same operands, lower flag set.
13375 using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
13376 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13377}
13379//*************************************************************************************************
13380
13381
13382//*************************************************************************************************
// Declares the given dense matrix product expression as unilower.
// This overload only accepts expressions whose lower flag is 'false'. The expression is
// first declared lower via decllow() and the result is passed on to declunilow(); since
// that result has its lower flag set, the nested call cannot select this overload again.
//
// Parameters:
//   dm  The multiplication expression to be declared unilower.
// Returns:
//   Whatever declunilow() yields for the lower-declared expression.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13407template< typename MT1 // Type of the left-hand side dense matrix
13408 , typename MT2 // Type of the right-hand side dense matrix
13409 , bool SF // Symmetry flag
13410 , bool HF // Hermitian flag
13411 , bool UF > // Upper flag
13412inline decltype(auto) declunilow( const DMatTDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
13413{
13415
// Only square matrices are accepted.
13416 if( !isSquare( dm ) ) {
13417 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
13418 }
13419
// Attach the lower flag to the product first, then declare the result unilower.
13420 return declunilow( decllow( *dm ) );
13421}
13423//*************************************************************************************************
13424
13425
13426//*************************************************************************************************
// Declares the given dense matrix product expression as strictly lower.
// This overload only accepts expressions whose lower flag is 'false'. The expression is
// first declared lower via decllow() and the result is passed on to declstrlow(); since
// that result has its lower flag set, the nested call cannot select this overload again.
//
// Parameters:
//   dm  The multiplication expression to be declared strictly lower.
// Returns:
//   Whatever declstrlow() yields for the lower-declared expression.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13451template< typename MT1 // Type of the left-hand side dense matrix
13452 , typename MT2 // Type of the right-hand side dense matrix
13453 , bool SF // Symmetry flag
13454 , bool HF // Hermitian flag
13455 , bool UF > // Upper flag
13456inline decltype(auto) declstrlow( const DMatTDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
13457{
13459
// Only square matrices are accepted.
13460 if( !isSquare( dm ) ) {
13461 BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
13462 }
13463
// Attach the lower flag to the product first, then declare the result strictly lower.
13464 return declstrlow( decllow( *dm ) );
13465}
13467//*************************************************************************************************
13468
13469
13470//*************************************************************************************************
// Declares the given dense matrix product expression as upper.
//
// Parameters:
//   dm  The multiplication expression to be declared upper.
// Returns:
//   A new DMatTDMatMultExpr over the same operands with the upper flag forced to
//   'true'; the symmetry, Hermitian and lower flags are carried over unchanged.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13495template< typename MT1 // Type of the left-hand side dense matrix
13496 , typename MT2 // Type of the right-hand side dense matrix
13497 , bool SF // Symmetry flag
13498 , bool HF // Hermitian flag
13499 , bool LF // Lower flag
13500 , bool UF > // Upper flag
13501inline decltype(auto) declupp( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13502{
13504
// Only square matrices can be declared upper.
13505 if( !isSquare( dm ) ) {
13506 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
13507 }
13508
// Same operands, upper flag set.
13509 using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
13510 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13511}
13513//*************************************************************************************************
13514
13515
13516//*************************************************************************************************
// Declares the given dense matrix product expression as uniupper.
// This overload only accepts expressions whose upper flag is 'false'. The expression is
// first declared upper via declupp() and the result is passed on to decluniupp(); since
// that result has its upper flag set, the nested call cannot select this overload again.
//
// Parameters:
//   dm  The multiplication expression to be declared uniupper.
// Returns:
//   Whatever decluniupp() yields for the upper-declared expression.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13541template< typename MT1 // Type of the left-hand side dense matrix
13542 , typename MT2 // Type of the right-hand side dense matrix
13543 , bool SF // Symmetry flag
13544 , bool HF // Hermitian flag
13545 , bool LF > // Lower flag
13546inline decltype(auto) decluniupp( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
13547{
13549
// Only square matrices are accepted.
13550 if( !isSquare( dm ) ) {
13551 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
13552 }
13553
// Attach the upper flag to the product first, then declare the result uniupper.
13554 return decluniupp( declupp( *dm ) );
13555}
13557//*************************************************************************************************
13558
13559
13560//*************************************************************************************************
// Declares the given dense matrix product expression as strictly upper.
// This overload only accepts expressions whose upper flag is 'false'. The expression is
// first declared upper via declupp() and the result is passed on to declstrupp(); since
// that result has its upper flag set, the nested call cannot select this overload again.
//
// Parameters:
//   dm  The multiplication expression to be declared strictly upper.
// Returns:
//   Whatever declstrupp() yields for the upper-declared expression.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13585template< typename MT1 // Type of the left-hand side dense matrix
13586 , typename MT2 // Type of the right-hand side dense matrix
13587 , bool SF // Symmetry flag
13588 , bool HF // Hermitian flag
13589 , bool LF > // Lower flag
13590inline decltype(auto) declstrupp( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
13591{
13593
// Only square matrices are accepted.
13594 if( !isSquare( dm ) ) {
13595 BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
13596 }
13597
// Attach the upper flag to the product first, then declare the result strictly upper.
13598 return declstrupp( declupp( *dm ) );
13599}
13601//*************************************************************************************************
13602
13603
13604//*************************************************************************************************
// Declares the given dense matrix product expression as diagonal.
//
// Parameters:
//   dm  The multiplication expression to be declared diagonal.
// Returns:
//   A new DMatTDMatMultExpr over the same operands with both the lower and the upper
//   flag forced to 'true'; the symmetry and Hermitian flags are carried over unchanged.
// Throws:
//   Via BLAZE_THROW_INVALID_ARGUMENT if the expression is not square.
13629template< typename MT1 // Type of the left-hand side dense matrix
13630 , typename MT2 // Type of the right-hand side dense matrix
13631 , bool SF // Symmetry flag
13632 , bool HF // Hermitian flag
13633 , bool LF // Lower flag
13634 , bool UF > // Upper flag
13635inline decltype(auto) decldiag( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13636{
13638
// Only square matrices can be declared diagonal.
13639 if( !isSquare( dm ) ) {
13640 BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
13641 }
13642
// Diagonal is expressed as "lower and upper at the same time".
13643 using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
13644 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13645}
13647//*************************************************************************************************
13648
13649
13650
13651
13652//=================================================================================================
13653//
13654// SIZE SPECIALIZATIONS
13655//
13656//=================================================================================================
13657
13658//*************************************************************************************************
// Size specialization for index 0 of a DMatTDMatMultExpr: inherited from the left-hand
// side operand type MT1 (presumably the number of rows - index meaning is defined by the
// primary Size template, which is outside this excerpt).
13660template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13661struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
13662 : public Size<MT1,0UL>
13663{};
13664
// Size specialization for index 1 of a DMatTDMatMultExpr: inherited from the right-hand
// side operand type MT2 (presumably the number of columns - see note above).
13665template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13666struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
13667 : public Size<MT2,1UL>
13668{};
13670//*************************************************************************************************
13671
13672
13673
13674
13675//=================================================================================================
13676//
13677// ISALIGNED SPECIALIZATIONS
13678//
13679//=================================================================================================
13680
13681//*************************************************************************************************
// IsAligned specialization for DMatTDMatMultExpr: the expression type counts as aligned
// only if both operand types MT1 and MT2 are aligned.
13683template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13684struct IsAligned< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13685 : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
13686{};
13688//*************************************************************************************************
13689
13690} // namespace blaze
13691
13692#endif
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.
Definition: Aliases.h:310
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for kernel specific block sizes.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Header file for the complex data type.
Header file for the conjugate shim.
Header file for the decldiag trait.
Header file for the DeclDiag functor.
Header file for the declherm trait.
Header file for the DeclHerm functor.
Header file for the decllow trait.
Header file for the DeclLow functor.
Header file for the declsym trait.
Header file for the DeclSym functor.
Header file for the declupp trait.
Header file for the DeclUpp functor.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsColumnMajorMatrix type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsRowMajorMatrix type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Header file for the dense matrix multiplication kernels.
Header file for the multiplication trait.
Header file for the Noop functor.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Data type constraint.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:592
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:548
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:170
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:108
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:602
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:167
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:176
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:159
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:570
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:538
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
MatScalarMultExpr< DenseMatrix< This, SO > > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:162
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:179
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:173
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:611
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:427
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:558
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:437
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:446
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:582
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:528
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:459
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:166
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:610
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:432
Expression object for dense matrix-transpose dense matrix multiplications.
Definition: DMatTDMatMultExpr.h:146
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:443
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:159
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:419
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:270
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:409
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:268
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:282
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:453
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:154
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:269
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:476
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:431
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:399
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:149
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:266
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:265
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:324
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:389
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:273
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:373
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:279
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatTDMatMultExpr.h:169
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:267
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:164
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatTDMatMultExpr.h:170
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatTDMatMultExpr.h:287
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:475
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:309
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:263
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatTDMatMultExpr.h:294
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatTDMatMultExpr.h:168
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatTDMatMultExpr.h:300
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:276
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatTDMatMultExpr.h:171
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:463
Base class for dense matrices.
Definition: DenseMatrix.h:82
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseMatrix base class.
Header file for the MatMatMultExpr base class.
Header file for the MatScalarMultExpr base class.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) declstrupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly upper.
Definition: DMatDeclStrUppExpr.h:1003
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1464
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:978
decltype(auto) declstrlow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly lower.
Definition: DMatDeclStrLowExpr.h:1003
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1004
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1004
decltype(auto) decluniupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as uniupper.
Definition: DMatDeclUniUppExpr.h:1005
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1005
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1005
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2156
decltype(auto) declunilow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as unilower.
Definition: DMatDeclUniLowExpr.h:1004
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.
Definition: StorageOrder.h:84
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatMatMultExpr.h:103
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:584
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:1383
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Constraints on the storage order of matrix types.
Header file for all forward declarations for expression class templates.
Header file for the Size type trait.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:127
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:61
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:126
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:61
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:126
Generic wrapper for the decllow() function.
Definition: DeclLow.h:61
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:126
Generic wrapper for the declsym() function.
Definition: DeclSym.h:61
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:126
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:61
Base class for all matrix/matrix multiplication expression templates.
Definition: MatMatMultExpr.h:71
Base template for the MultTrait class.
Definition: MultTrait.h:130
Generic wrapper for the null function.
Definition: Noop.h:62
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.