Blaze 3.9
TDVecDMatMultExpr.h
Go to the documentation of this file.
1//=================================================================================================
33//=================================================================================================
34
35#ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
37
38
39//*************************************************************************************************
40// Includes
41//*************************************************************************************************
42
45#include <blaze/math/Aliases.h>
64#include <blaze/math/SIMD.h>
86#include <blaze/system/BLAS.h>
92#include <blaze/util/Assert.h>
93#include <blaze/util/Complex.h>
95#include <blaze/util/EnableIf.h>
98#include <blaze/util/mpl/If.h>
99#include <blaze/util/Types.h>
107
108
109namespace blaze {
110
111//=================================================================================================
112//
113// CLASS TDVECDMATMULTEXPR
114//
115//=================================================================================================
116
117//*************************************************************************************************
124template< typename VT // Type of the left-hand side dense vector
125 , typename MT > // Type of the right-hand side dense matrix
127 : public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
128 , private Computation
129{
130 private:
131 //**Type definitions****************************************************************************
138 //**********************************************************************************************
139
   //**********************************************************************************************
   //! Compilation switch for the evaluation strategy of the left-hand side dense vector
   //! operand: a temporary evaluation is performed whenever the vector is itself a
   //! computation or otherwise requires an intermediate evaluation.
   static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
   //**********************************************************************************************

   //**********************************************************************************************
   //! Compilation switch for the evaluation strategy of the right-hand side dense matrix
   //! operand: the matrix is evaluated into a temporary if it requires an intermediate
   //! evaluation, or if it is a computation whose element type matches the vector's and is
   //! BLAS compatible (so the evaluated result can be handed to a BLAS kernel).
   static constexpr bool evaluateMatrix =
      ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
          IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
   //**********************************************************************************************

   //**********************************************************************************************
   //! Helper variable template for the explicit application of the SFINAE principle: a
   //! non-default SMP assignment strategy is selected whenever either operand requires an
   //! intermediate evaluation.
   //! NOTE(review): the template parameter T1 is unused in this condition — presumably kept
   //! for signature uniformity with the other selector templates; confirm against the
   //! elided documentation.
   template< typename T1 >
   static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
   //**********************************************************************************************
162
163 //**********************************************************************************************
165
168 template< typename T1, typename T2, typename T3 >
169 static constexpr bool UseBlasKernel_v =
171 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
172 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
173 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
174 !IsDiagonal_v<T3> &&
175 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
176 IsBLASCompatible_v< ElementType_t<T1> > &&
177 IsBLASCompatible_v< ElementType_t<T2> > &&
178 IsBLASCompatible_v< ElementType_t<T3> > &&
179 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
180 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
182 //**********************************************************************************************
183
184 //**********************************************************************************************
186
190 template< typename T1, typename T2, typename T3 >
191 static constexpr bool UseVectorizedDefaultKernel_v =
192 ( useOptimizedKernels &&
193 !IsDiagonal_v<T3> &&
194 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
195 IsSIMDCombinable_v< ElementType_t<T1>
197 , ElementType_t<T3> > &&
198 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
199 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
201 //**********************************************************************************************
202
203 public:
204 //**Type definitions****************************************************************************
207
210
   using ReturnType    = const ElementType;  //!< Return type for expression template evaluations.
   using CompositeType = const ResultType;   //!< Data type for composite expression templates.

   //! Composite type of the left-hand side dense vector expression: expressions are stored
   //! by value, plain operands by (const) reference.
   using LeftOperand = If_t< IsExpression_v<VT>, const VT, const VT& >;

   //! Composite type of the right-hand side dense matrix expression: expressions are stored
   //! by value, plain operands by (const) reference.
   using RightOperand = If_t< IsExpression_v<MT>, const MT, const MT& >;
223
226
229 //**********************************************************************************************
230
231 //**Compilation flags***************************************************************************
   //! Compilation switch for the expression template assignment strategy: vectorized
   //! (SIMD) assignment is possible for non-diagonal matrices whose operands are SIMD
   //! enabled and whose element types support SIMD addition and multiplication.
   static constexpr bool simdEnabled =
      ( !IsDiagonal_v<MT> &&
        VT::simdEnabled && MT::simdEnabled &&
        HasSIMDAdd_v<VET,MET> &&
        HasSIMDMult_v<VET,MET> );

   //! Compilation switch for SMP assignments: only possible if neither operand requires an
   //! intermediate evaluation and both operands are themselves SMP assignable.
   static constexpr bool smpAssignable =
      ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
   //**********************************************************************************************

   //**SIMD properties*****************************************************************************
   //! The number of elements packed within a single SIMD vector of the element type.
   static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
247 //**********************************************************************************************
248
249 //**Constructor*********************************************************************************
   //! \brief Constructor for the TDVecDMatMultExpr class.
   //!
   //! \param vec The left-hand side dense vector operand of the multiplication expression.
   //! \param mat The right-hand side dense matrix operand of the multiplication expression.
   //!
   //! The operands are captured according to LeftOperand/RightOperand (by value for
   //! expressions, by reference otherwise); the vector size must match the matrix row count.
   inline TDVecDMatMultExpr( const VT& vec, const MT& mat ) noexcept
      : vec_( vec )  // Left-hand side dense vector of the multiplication expression
      , mat_( mat )  // Right-hand side dense matrix of the multiplication expression
   {
      BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
   }
261 //**********************************************************************************************
262
263 //**Subscript operator**************************************************************************
   //! \brief Subscript operator for the direct access to the vector elements.
   //!
   //! \param index Access index; must be smaller than the number of matrix columns
   //!        (checked by assertion only).
   //! \return The dot product of the vector operand with matrix column \a index.
   inline ReturnType operator[]( size_t index ) const {
      BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );

      if( IsDiagonal_v<MT> )
      {
         // Diagonal matrix: only the diagonal element of the column contributes.
         return vec_[index] * mat_(index,index);
      }
      else if( IsLower_v<MT> && ( index > 8UL ) )
      {
         // Lower (triangular) matrix: entries above the diagonal are zero, so the dot
         // product is restricted to rows [begin, rows). The 8UL threshold skips the
         // subvector setup when only a few leading rows would be saved — presumably a
         // tuning heuristic; rationale not documented here.
         const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
         const size_t n    ( mat_.rows() - begin );
         return subvector( vec_, begin, n, unchecked ) *
                subvector( column( mat_, index, unchecked ), begin, n, unchecked );
      }
      else if( IsUpper_v<MT> && ( index + 8UL < mat_.rows() ) )
      {
         // Upper (triangular) matrix: entries below the diagonal are zero, so only the
         // leading n rows of column 'index' contribute.
         const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
         return subvector( vec_, 0UL, n, unchecked ) *
                subvector( column( mat_, index, unchecked ), 0UL, n, unchecked );
      }
      else
      {
         // General case: full dot product with matrix column 'index'.
         return vec_ * column( mat_, index, unchecked );
      }
   }
294 //**********************************************************************************************
295
296 //**At function*********************************************************************************
303 inline ReturnType at( size_t index ) const {
304 if( index >= mat_.columns() ) {
305 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
306 }
307 return (*this)[index];
308 }
309 //**********************************************************************************************
310
311 //**Size function*******************************************************************************
   //! \brief Returns the current size/dimension of the vector.
   //!
   //! \return The size of the result vector, i.e. the number of columns of the matrix operand.
   inline size_t size() const noexcept {
      return mat_.columns();
   }
319 //**********************************************************************************************
320
321 //**Left operand access*************************************************************************
   //! \brief Returns the left-hand side transpose dense vector operand.
   //!
   //! \return The left-hand side transpose dense vector operand.
   inline LeftOperand leftOperand() const noexcept {
      return vec_;
   }
329 //**********************************************************************************************
330
331 //**Right operand access************************************************************************
   //! \brief Returns the right-hand side dense matrix operand.
   //!
   //! \return The right-hand side dense matrix operand.
   inline RightOperand rightOperand() const noexcept {
      return mat_;
   }
339 //**********************************************************************************************
340
341 //**********************************************************************************************
347 template< typename T >
348 inline bool canAlias( const T* alias ) const noexcept {
349 return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
350 }
351 //**********************************************************************************************
352
353 //**********************************************************************************************
359 template< typename T >
360 inline bool isAliased( const T* alias ) const noexcept {
361 return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
362 }
363 //**********************************************************************************************
364
365 //**********************************************************************************************
370 inline bool isAligned() const noexcept {
371 return vec_.isAligned() && mat_.isAligned();
372 }
373 //**********************************************************************************************
374
375 //**********************************************************************************************
   //! \brief Returns whether the expression can be used in SMP assignments.
   //!
   //! \return \a true in case the expression can be used in SMP assignments, \a false if not.
   //!
   //! SMP assignment is only used for sufficiently large result vectors. Additionally,
   //! when BLAS mode is enabled and the matrix is large enough (and fully evaluated) to be
   //! handled by the BLAS kernel, SMP assignment is suppressed — mirroring the kernel
   //! selection in selectAssignKernel().
   inline bool canSMPAssign() const noexcept {
      return ( !BLAZE_BLAS_MODE ||
               ( IsComputation_v<MT> && !evaluateMatrix ) ||
               ( mat_.rows() * mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
             ( size() > SMP_TDVECDMATMULT_THRESHOLD );
   }
388 //**********************************************************************************************
389
390 private:
391 //**Member variables****************************************************************************
394 //**********************************************************************************************
395
396 //**Assignment to dense vectors*****************************************************************
   //! \brief Assignment of a transpose dense vector-dense matrix multiplication to a dense
   //!        vector (\f$ \vec{y}^T=\vec{x}^T*A \f$).
   //!
   //! \param lhs The target left-hand side dense vector.
   //! \param rhs The right-hand side multiplication expression to be assigned.
   //! \return void
   template< typename VT1 >  // Type of the target dense vector
   friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // A product over zero rows — or over a 1x1 strictly triangular (hence all-zero)
      // matrix — yields the zero vector.
      if( rhs.mat_.rows() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
         reset( *lhs );
         return;
      }
      // An empty result vector requires no work at all.
      else if( rhs.mat_.columns() == 0UL ) {
         return;
      }

      LT x( serial( rhs.vec_ ) );  // Evaluation of the left-hand side dense vector operand
      RT A( serial( rhs.mat_ ) );  // Evaluation of the right-hand side dense matrix operand

      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size()     , "Invalid vector size"       );

      // Dispatch to the small/BLAS kernel depending on matrix size and properties.
      TDVecDMatMultExpr::selectAssignKernel( *lhs, x, A );
   }
436 //**********************************************************************************************
437
438 //**Assignment to dense vectors (kernel selection)**********************************************
449 template< typename VT1 // Type of the left-hand side target vector
450 , typename VT2 // Type of the left-hand side vector operand
451 , typename MT1 > // Type of the right-hand side matrix operand
452 static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
453 {
454 if( ( IsDiagonal_v<MT1> ) ||
455 ( IsComputation_v<MT> && !evaluateMatrix ) ||
456 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
457 selectSmallAssignKernel( y, x, A );
458 else
459 selectBlasAssignKernel( y, x, A );
460 }
462 //**********************************************************************************************
463
464 //**Default assignment to dense vectors*********************************************************
   //! \brief Default assignment of a transpose dense vector-dense matrix multiplication
   //!        (\f$ \vec{y}^T=\vec{x}^T*A \f$).
   //!
   //! \param y The target left-hand side dense vector.
   //! \param x The left-hand side dense vector operand.
   //! \param A The right-hand side dense matrix operand.
   //! \return void
   //!
   //! Scalar kernel: the result is built row by row of \a A. The first relevant row
   //! initializes \a y, all further rows accumulate into it. For triangular matrices the
   //! column range of each row is restricted to its potentially non-zero part; the inner
   //! loop is unrolled by a factor of two.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      // Column 0 of a strictly upper matrix is entirely zero.
      if( IsStrictlyUpper_v<MT1> ) {
         reset( y[0] );
      }

      // For non-lower matrices, row 0 provides the initial value of every result element.
      if( !IsLower_v<MT1> )
      {
         const size_t jbegin( IsStrictlyUpper_v<MT1> ? 1UL : 0UL );
         for( size_t j=jbegin; j<N; ++j ) {
            y[j] = x[0UL] * A(0UL,j);
         }
      }

      // Start at row 0 only for (non-strictly) lower matrices; otherwise row 0 is either
      // all zero (strictly lower) or was already consumed by the initialization above.
      for( size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
      {
         if( IsDiagonal_v<MT1> )
         {
            y[i] = x[i] * A(i,i);
         }
         else
         {
            // Restrict the columns to the potentially non-zero part of row i.
            const size_t jbegin( ( IsUpper_v<MT1> )
                                 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                                 :( 0UL ) );
            const size_t jend( ( IsLower_v<MT1> )
                               ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
                               :( N ) );
            BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );

            const size_t jnum( jend - jbegin );
            const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
            BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );

            // Inner loop, unrolled by two.
            for( size_t j=jbegin; j<jpos; j+=2UL ) {
               y[j    ] += x[i] * A(i,j    );
               y[j+1UL] += x[i] * A(i,j+1UL);
            }
            if( jpos < jend ) {
               y[jpos] += x[i] * A(i,jpos);
            }
            // For lower matrices, row i is the first row touching column jend, hence
            // plain assignment instead of accumulation.
            if( IsLower_v<MT1> ) {
               y[jend] = x[i] * A(i,jend);
            }
         }
      }

      // The last column of a strictly lower matrix is entirely zero.
      if( IsStrictlyLower_v<MT1> ) {
         reset( y[N-1UL] );
      }
   }
536 //**********************************************************************************************
537
538 //**Default assignment to dense vectors (small matrices)****************************************
   //! \brief Default assignment of a small transpose dense vector-dense matrix
   //!        multiplication (\f$ \vec{y}^T=\vec{x}^T*A \f$).
   //!
   //! \param y The target left-hand side dense vector.
   //! \param x The left-hand side dense vector operand.
   //! \param A The right-hand side dense matrix operand.
   //! \return void
   //!
   //! Fallback overload, selected when the vectorized kernel is not applicable: forwards
   //! to the scalar default kernel.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultAssignKernel( y, x, A );
   }
561 //**********************************************************************************************
562
563 //**Vectorized default assignment to dense vectors (small matrices)*****************************
   //! \brief Vectorized default assignment of a small transpose dense vector-dense matrix
   //!        multiplication (\f$ \vec{y}^T=\vec{x}^T*A \f$).
   //!
   //! \param y The target left-hand side dense vector.
   //! \param x The left-hand side dense vector operand.
   //! \param A The right-hand side dense matrix operand.
   //! \return void
   //!
   //! SIMD kernel for small matrices: the columns of \a A are processed in blocks of
   //! 8/4/3/2/1 SIMD vectors. For each block the dot products with \a x are accumulated in
   //! SIMD registers over all rows that can be non-zero (restricted via the triangular
   //! properties of \a A) and stored to \a y in a single pass. A trailing scalar loop
   //! handles the remainder columns when the operands may be unpadded.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      // A scalar remainder loop is only required if either operand may be unpadded.
      constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      // Last column index reachable with full SIMD vectors.
      const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
      BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );

      size_t j( 0UL );

      // Blocks of 8 SIMD vectors (8*SIMDSIZE columns).
      for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
      {
         // For triangular A, restrict the rows to the potentially non-zero range.
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         // The first row initializes the accumulators, all further rows accumulate.
         SIMDType x1( set( x[ibegin] ) );
         SIMDType xmm1( x1 * A.load(ibegin,j             ) );
         SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE    ) );
         SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
         SIMDType xmm4( x1 * A.load(ibegin,j+SIMDSIZE*3UL) );
         SIMDType xmm5( x1 * A.load(ibegin,j+SIMDSIZE*4UL) );
         SIMDType xmm6( x1 * A.load(ibegin,j+SIMDSIZE*5UL) );
         SIMDType xmm7( x1 * A.load(ibegin,j+SIMDSIZE*6UL) );
         SIMDType xmm8( x1 * A.load(ibegin,j+SIMDSIZE*7UL) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            x1 = set( x[i] );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
            xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
            xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
            xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
            xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
            xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
         y.store( j+SIMDSIZE*3UL, xmm4 );
         y.store( j+SIMDSIZE*4UL, xmm5 );
         y.store( j+SIMDSIZE*5UL, xmm6 );
         y.store( j+SIMDSIZE*6UL, xmm7 );
         y.store( j+SIMDSIZE*7UL, xmm8 );
      }

      // Blocks of 4 SIMD vectors.
      for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType x1( set( x[ibegin] ) );
         SIMDType xmm1( x1 * A.load(ibegin,j             ) );
         SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE    ) );
         SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
         SIMDType xmm4( x1 * A.load(ibegin,j+SIMDSIZE*3UL) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            x1 = set( x[i] );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
            xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
         y.store( j+SIMDSIZE*3UL, xmm4 );
      }

      // Blocks of 3 SIMD vectors.
      for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType x1( set( x[ibegin] ) );
         SIMDType xmm1( x1 * A.load(ibegin,j             ) );
         SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE    ) );
         SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            x1 = set( x[i] );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
      }

      // Blocks of 2 SIMD vectors.
      for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType x1( set( x[ibegin] ) );
         SIMDType xmm1( x1 * A.load(ibegin,j         ) );
         SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            x1 = set( x[i] );
            xmm1 += x1 * A.load(i,j         );
            xmm2 += x1 * A.load(i,j+SIMDSIZE);
         }

         y.store( j         , xmm1 );
         y.store( j+SIMDSIZE, xmm2 );
      }

      // Single SIMD vectors.
      for( ; j<jpos; j+=SIMDSIZE )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( set( x[ibegin] ) * A.load(ibegin,j) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            xmm1 += set( x[i] ) * A.load(i,j);
         }

         y.store( j, xmm1 );
      }

      // Scalar remainder loop for the trailing (unpadded) columns.
      for( ; remainder && j<N; ++j )
      {
         const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )?( min( j+1UL, M ) ):( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         ElementType value( x[ibegin] * A(ibegin,j) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            value += x[i] * A(i,j);
         }

         y[j] = value;
      }
   }
751 //**********************************************************************************************
752
753 //**Default assignment to dense vectors (large matrices)****************************************
   //! \brief Default assignment of a large transpose dense vector-dense matrix
   //!        multiplication (\f$ \vec{y}^T=\vec{x}^T*A \f$).
   //!
   //! \param y The target left-hand side dense vector.
   //! \param x The left-hand side dense vector operand.
   //! \param A The right-hand side dense matrix operand.
   //! \return void
   //!
   //! Fallback overload, selected when the vectorized kernel is not applicable: forwards
   //! to the scalar default kernel.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultAssignKernel( y, x, A );
   }
776 //**********************************************************************************************
777
778 //**Vectorized default assignment to dense vectors (large matrices)*****************************
792 template< typename VT1 // Type of the left-hand side target vector
793 , typename VT2 // Type of the left-hand side vector operand
794 , typename MT1 > // Type of the right-hand side matrix operand
795 static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
796 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
797 {
798 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
799
800 const size_t M( A.rows() );
801 const size_t N( A.columns() );
802
803 const size_t jblock( 32768UL / sizeof( ElementType ) );
804 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
805
806 BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
807
808 reset( y );
809
810 for( size_t jj=0U; jj<N; jj+=jblock ) {
811 for( size_t ii=0UL; ii<M; ii+=iblock )
812 {
813 const size_t iend( min( ii+iblock, M ) );
814 const size_t jtmp( min( jj+jblock, N ) );
815 const size_t jend( ( IsLower_v<MT1> )
816 ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
817 :( jtmp ) );
818
819 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
820 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
821
822 size_t j( ( IsUpper_v<MT1> )
823 ?( max( jj, prevMultiple( ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ), SIMDSIZE ) ) )
824 :( jj ) );
825
826 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
827 {
828 SIMDType x1( set( x[ii] ) );
829 SIMDType xmm1( x1 * A.load(ii,j ) );
830 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
831 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
832 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
833 SIMDType xmm5( x1 * A.load(ii,j+SIMDSIZE*4UL) );
834 SIMDType xmm6( x1 * A.load(ii,j+SIMDSIZE*5UL) );
835 SIMDType xmm7( x1 * A.load(ii,j+SIMDSIZE*6UL) );
836 SIMDType xmm8( x1 * A.load(ii,j+SIMDSIZE*7UL) );
837
838 for( size_t i=ii+1UL; i<iend; ++i ) {
839 x1 = set( x[i] );
840 xmm1 += x1 * A.load(i,j );
841 xmm2 += x1 * A.load(i,j+SIMDSIZE );
842 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
843 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
844 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
845 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
846 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
847 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
848 }
849
850 y.store( j , y.load(j ) + xmm1 );
851 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
852 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
853 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
854 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
855 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
856 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
857 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
858 }
859
860 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
861 {
862 SIMDType x1( set( x[ii] ) );
863 SIMDType xmm1( x1 * A.load(ii,j ) );
864 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
865 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
866 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
867
868 for( size_t i=ii+1UL; i<iend; ++i ) {
869 x1 = set( x[i] );
870 xmm1 += x1 * A.load(i,j );
871 xmm2 += x1 * A.load(i,j+SIMDSIZE );
872 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
873 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
874 }
875
876 y.store( j , y.load(j ) + xmm1 );
877 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
878 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
879 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
880 }
881
882 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
883 {
884 SIMDType x1( set( x[ii] ) );
885 SIMDType xmm1( x1 * A.load(ii,j ) );
886 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
887 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
888
889 for( size_t i=ii+1UL; i<iend; ++i ) {
890 x1 = set( x[i] );
891 xmm1 += x1 * A.load(i,j );
892 xmm2 += x1 * A.load(i,j+SIMDSIZE );
893 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
894 }
895
896 y.store( j , y.load(j ) + xmm1 );
897 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
898 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
899 }
900
901 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
902 {
903 SIMDType x1( set( x[ii] ) );
904 SIMDType xmm1( x1 * A.load(ii,j ) );
905 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE) );
906
907 for( size_t i=ii+1UL; i<iend; ++i ) {
908 x1 = set( x[i] );
909 xmm1 += x1 * A.load(i,j );
910 xmm2 += x1 * A.load(i,j+SIMDSIZE);
911 }
912
913 y.store( j , y.load(j ) + xmm1 );
914 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
915 }
916
917 for( ; j<jpos; j+=SIMDSIZE )
918 {
919 SIMDType xmm1( set( x[ii] ) * A.load(ii,j) );
920
921 for( size_t i=ii+1UL; i<iend; ++i ) {
922 xmm1 += set( x[i] ) * A.load(i,j);
923 }
924
925 y.store( j, y.load(j) + xmm1 );
926 }
927
928 for( ; remainder && j<jend; ++j )
929 {
930 ElementType value( x[ii] * A(ii,j) );
931
932 for( size_t i=ii+1UL; i<iend; ++i ) {
933 value += x[i] * A(i,j);
934 }
935
936 y[j] += value;
937 }
938 }
939 }
940 }
942 //**********************************************************************************************
943
944 //**BLAS-based assignment to dense vectors (default)********************************************
   //! \brief BLAS-based assignment of a transpose dense vector-dense matrix multiplication
   //!        (default fallback).
   //!
   //! \param y The target left-hand side dense vector.
   //! \param x The left-hand side dense vector operand.
   //! \param A The right-hand side dense matrix operand.
   //! \return void
   //!
   //! Fallback overload, selected when the BLAS kernel is not applicable: forwards to the
   //! large-matrix default kernel.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      selectLargeAssignKernel( y, x, A );
   }
967 //**********************************************************************************************
968
969 //**BLAS-based assignment to dense vectors******************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   //! \brief BLAS-based assignment of a transpose dense vector-dense matrix multiplication
   //!        (\f$ \vec{y}^T=\vec{x}^T*A \f$).
   //!
   //! \param y The target left-hand side dense vector.
   //! \param x The left-hand side dense vector operand.
   //! \param A The right-hand side dense matrix operand.
   //! \return void
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         // Triangular A: copy x into y, then multiply in place via the xTRMV kernel.
         assign( y, x );
         trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      }
      else {
         // General A: y = 1*(x*A) + 0*y via the xGEMV kernel.
         gemv( y, x, A, ET(1), ET(0) );
      }
   }
#endif
1002 //**********************************************************************************************
1003
1004 //**Assignment to sparse vectors****************************************************************
   //! \brief Assignment of a transpose dense vector-dense matrix multiplication to a
   //!        sparse vector.
   //!
   //! \param lhs The target left-hand side sparse vector.
   //! \param rhs The right-hand side multiplication expression to be assigned.
   //! \return void
   template< typename VT1 >  // Type of the target sparse vector
   friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // Evaluate the dense result once, then assign it to the sparse target.
      const ResultType tmp( serial( rhs ) );
      assign( *lhs, tmp );
   }
1032 //**********************************************************************************************
1033
1034 //**Addition assignment to dense vectors********************************************************
   //! \brief Addition assignment of a transpose dense vector-dense matrix multiplication
   //!        to a dense vector (\f$ \vec{y}^T+=\vec{x}^T*A \f$).
   //!
   //! \param lhs The target left-hand side dense vector.
   //! \param rhs The right-hand side multiplication expression to be added.
   //! \return void
   template< typename VT1 >  // Type of the target dense vector
   friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // Nothing to add if either dimension is zero or the matrix is a 1x1 strictly
      // triangular (hence all-zero) matrix.
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
         return;
      }

      LT x( serial( rhs.vec_ ) );  // Evaluation of the left-hand side dense vector operand
      RT A( serial( rhs.mat_ ) );  // Evaluation of the right-hand side dense matrix operand

      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size()     , "Invalid vector size"       );

      // Dispatch to the small/BLAS kernel depending on matrix size and properties.
      TDVecDMatMultExpr::selectAddAssignKernel( *lhs, x, A );
   }
1070 //**********************************************************************************************
1071
1072 //**Addition assignment to dense vectors (kernel selection)*************************************
1083 template< typename VT1 // Type of the left-hand side target vector
1084 , typename VT2 // Type of the left-hand side vector operand
1085 , typename MT1 > // Type of the right-hand side matrix operand
1086 static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1087 {
1088 if( ( IsDiagonal_v<MT1> ) ||
1089 ( IsComputation_v<MT> && !evaluateMatrix ) ||
1090 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1091 selectSmallAddAssignKernel( y, x, A );
1092 else
1093 selectBlasAddAssignKernel( y, x, A );
1094 }
1096 //**********************************************************************************************
1097
1098 //**Default addition assignment to dense vectors************************************************
   //! \brief Default addition assignment of a transpose dense vector-dense matrix
   //!        multiplication (\f$ \vec{y}^T+=\vec{x}^T*A \f$).
   //!
   //! \param y The target left-hand side dense vector.
   //! \param x The left-hand side dense vector operand.
   //! \param A The right-hand side dense matrix operand.
   //! \return void
   //!
   //! Scalar kernel: every row i of \a A accumulates x[i]*A(i,j) into \a y over the
   //! potentially non-zero column range of that row (restricted via the triangular
   //! properties of \a A); the inner loop is unrolled by a factor of two. Unlike the
   //! plain assignment kernel, no initialization pass is needed since all contributions
   //! are added onto the existing contents of \a y.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      for( size_t i=0UL; i<M; ++i )
      {
         if( IsDiagonal_v<MT1> )
         {
            y[i] += x[i] * A(i,i);
         }
         else
         {
            // Restrict the columns to the potentially non-zero part of row i.
            const size_t jbegin( ( IsUpper_v<MT1> )
                                 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                                 :( 0UL ) );
            const size_t jend( ( IsLower_v<MT1> )
                               ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
                               :( N ) );
            BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );

            const size_t jnum( jend - jbegin );
            const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
            BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );

            // Inner loop, unrolled by two.
            for( size_t j=jbegin; j<jpos; j+=2UL ) {
               y[j    ] += x[i] * A(i,j    );
               y[j+1UL] += x[i] * A(i,j+1UL);
            }
            if( jpos < jend ) {
               y[jpos] += x[i] * A(i,jpos);
            }
         }
      }
   }
1151 //**********************************************************************************************
1152
1153 //**Default addition assignment to dense vectors (small matrices)*******************************
   //! \brief Default addition assignment of a small transpose dense vector-dense matrix
   //!        multiplication (\f$ \vec{y}^T+=\vec{x}^T*A \f$).
   //!
   //! \param y The target left-hand side dense vector.
   //! \param x The left-hand side dense vector operand.
   //! \param A The right-hand side dense matrix operand.
   //! \return void
   //!
   //! Fallback overload, selected when the vectorized kernel is not applicable: forwards
   //! to the scalar default kernel.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultAddAssignKernel( y, x, A );
   }
1176 //**********************************************************************************************
1177
1178 //**Vectorized default addition assignment to dense vectors (small matrices)********************
   /*!\brief Vectorized default addition assignment of a small transpose dense vector-dense
   //        matrix multiplication (\f$ \vec{y}^T+=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function implements the vectorized default addition assignment kernel for the
   // transpose dense vector-dense matrix multiplication. The columns of A are processed in
   // SIMD-wide chunks with decreasing unroll factors (8x, 4x, 3x, 2x, 1x), followed by a
   // scalar remainder loop in case the operands are not padded. This kernel is optimized for
   // small matrices.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      // A scalar remainder loop is only required if either operand is not padded to a
      // multiple of the SIMD vector width.
      constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );

      const size_t M( A.rows() );
      const size_t N( A.columns() );

      const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
      BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );

      size_t j( 0UL );

      for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
      {
         // For (strictly) triangular matrices only rows of the structurally non-zero band
         // of the current column chunk contribute to the accumulation.
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         // Accumulate directly on top of the current contents of y.
         SIMDType xmm1( y.load(j             ) );
         SIMDType xmm2( y.load(j+SIMDSIZE    ) );
         SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
         SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
         SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
         SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
         SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
         SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );

         for( size_t i=ibegin; i<iend; ++i ) {
            const SIMDType x1( set( x[i] ) );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
            xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
            xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
            xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
            xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
            xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
         y.store( j+SIMDSIZE*3UL, xmm4 );
         y.store( j+SIMDSIZE*4UL, xmm5 );
         y.store( j+SIMDSIZE*5UL, xmm6 );
         y.store( j+SIMDSIZE*6UL, xmm7 );
         y.store( j+SIMDSIZE*7UL, xmm8 );
      }

      for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( y.load(j             ) );
         SIMDType xmm2( y.load(j+SIMDSIZE    ) );
         SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
         SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );

         for( size_t i=ibegin; i<iend; ++i ) {
            const SIMDType x1( set( x[i] ) );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
            xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
         y.store( j+SIMDSIZE*3UL, xmm4 );
      }

      for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( y.load(j             ) );
         SIMDType xmm2( y.load(j+SIMDSIZE    ) );
         SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );

         for( size_t i=ibegin; i<iend; ++i ) {
            const SIMDType x1( set( x[i] ) );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
      }

      for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( y.load(j         ) );
         SIMDType xmm2( y.load(j+SIMDSIZE) );

         for( size_t i=ibegin; i<iend; ++i ) {
            const SIMDType x1( set( x[i] ) );
            xmm1 += x1 * A.load(i,j         );
            xmm2 += x1 * A.load(i,j+SIMDSIZE);
         }

         y.store( j         , xmm1 );
         y.store( j+SIMDSIZE, xmm2 );
      }

      for( ; j<jpos; j+=SIMDSIZE )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( y.load(j) );

         for( size_t i=ibegin; i<iend; ++i ) {
            xmm1 += set( x[i] ) * A.load(i,j);
         }

         y.store( j, xmm1 );
      }

      // Scalar remainder loop for the trailing columns (only taken for unpadded operands).
      for( ; remainder && j<N; ++j )
      {
         const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )?( min( j+1UL, M ) ):( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         ElementType value( x[ibegin] * A(ibegin,j) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            value += x[i] * A(i,j);
         }

         y[j] += value;
      }
   }
1362 //**********************************************************************************************
1363
1364 //**Default addition assignment to dense vectors (large matrices)*******************************
   /*!\brief Default addition assignment of a large transpose dense vector-dense matrix
   //        multiplication (\f$ \vec{y}^T+=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function relays to the default (scalar) kernel. It is selected whenever the
   // vectorized default kernel cannot be applied to the given operand types.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultAddAssignKernel( y, x, A );
   }
1387 //**********************************************************************************************
1388
1389 //**Vectorized default addition assignment to dense vectors (large matrices)********************
1403 template< typename VT1 // Type of the left-hand side target vector
1404 , typename VT2 // Type of the left-hand side vector operand
1405 , typename MT1 > // Type of the right-hand side matrix operand
1406 static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1407 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1408 {
1409 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1410
1411 const size_t M( A.rows() );
1412 const size_t N( A.columns() );
1413
1414 const size_t jblock( 32768UL / sizeof( ElementType ) );
1415 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1416
1417 BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1418
1419 for( size_t jj=0U; jj<N; jj+=jblock ) {
1420 for( size_t ii=0UL; ii<M; ii+=iblock )
1421 {
1422 const size_t iend( min( ii+iblock, M ) );
1423 const size_t jtmp( min( jj+jblock, N ) );
1424 const size_t jend( ( IsLower_v<MT1> )
1425 ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1426 :( jtmp ) );
1427
1428 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1429 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1430
1431 size_t j( ( IsUpper_v<MT1> )
1432 ?( max( jj, prevMultiple( ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ), SIMDSIZE ) ) )
1433 :( jj ) );
1434
1435 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1436 {
1437 SIMDType x1( set( x[ii] ) );
1438 SIMDType xmm1( x1 * A.load(ii,j ) );
1439 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
1440 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
1441 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
1442 SIMDType xmm5( x1 * A.load(ii,j+SIMDSIZE*4UL) );
1443 SIMDType xmm6( x1 * A.load(ii,j+SIMDSIZE*5UL) );
1444 SIMDType xmm7( x1 * A.load(ii,j+SIMDSIZE*6UL) );
1445 SIMDType xmm8( x1 * A.load(ii,j+SIMDSIZE*7UL) );
1446
1447 for( size_t i=ii+1UL; i<iend; ++i ) {
1448 x1 = set( x[i] );
1449 xmm1 += x1 * A.load(i,j );
1450 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1451 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1452 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1453 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1454 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1455 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1456 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1457 }
1458
1459 y.store( j , y.load(j ) + xmm1 );
1460 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1461 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1462 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1463 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1464 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1465 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1466 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1467 }
1468
1469 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1470 {
1471 SIMDType x1( set( x[ii] ) );
1472 SIMDType xmm1( x1 * A.load(ii,j ) );
1473 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
1474 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
1475 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
1476
1477 for( size_t i=ii+1UL; i<iend; ++i ) {
1478 x1 = set( x[i] );
1479 xmm1 += x1 * A.load(i,j );
1480 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1481 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1482 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1483 }
1484
1485 y.store( j , y.load(j ) + xmm1 );
1486 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1487 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1488 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1489 }
1490
1491 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1492 {
1493 SIMDType x1( set( x[ii] ) );
1494 SIMDType xmm1( x1 * A.load(ii,j ) );
1495 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
1496 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
1497
1498 for( size_t i=ii+1UL; i<iend; ++i ) {
1499 x1 = set( x[i] );
1500 xmm1 += x1 * A.load(i,j );
1501 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1502 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1503 }
1504
1505 y.store( j , y.load(j ) + xmm1 );
1506 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1507 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1508 }
1509
1510 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1511 {
1512 SIMDType x1( set( x[ii] ) );
1513 SIMDType xmm1( x1 * A.load(ii,j ) );
1514 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE) );
1515
1516 for( size_t i=ii+1UL; i<iend; ++i ) {
1517 x1 = set( x[i] );
1518 xmm1 += x1 * A.load(i,j );
1519 xmm2 += x1 * A.load(i,j+SIMDSIZE);
1520 }
1521
1522 y.store( j , y.load(j ) + xmm1 );
1523 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1524 }
1525
1526 for( ; j<jpos; j+=SIMDSIZE )
1527 {
1528 SIMDType xmm1( set( x[ii] ) * A.load(ii,j) );
1529
1530 for( size_t i=ii+1UL; i<iend; ++i ) {
1531 xmm1 += set( x[i] ) * A.load(i,j);
1532 }
1533
1534 y.store( j, y.load(j) + xmm1 );
1535 }
1536
1537 for( ; remainder && j<jend; ++j )
1538 {
1539 ElementType value( x[ii] * A(ii,j) );
1540
1541 for( size_t i=ii+1UL; i<iend; ++i ) {
1542 value += x[i] * A(i,j);
1543 }
1544
1545 y[j] += value;
1546 }
1547 }
1548 }
1549 }
1551 //**********************************************************************************************
1552
1553 //**BLAS-based addition assignment to dense vectors (default)***********************************
   /*!\brief Default addition assignment of a transpose dense vector-dense matrix multiplication
   //        (\f$ \vec{y}^T+=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function relays to the kernel for large matrices. It is selected whenever a BLAS
   // kernel cannot be applied to the given operand types.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      selectLargeAddAssignKernel( y, x, A );
   }
1576 //**********************************************************************************************
1577
1578 //**BLAS-based addition assignment to dense vectors*********************************************
1579#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   /*!\brief BLAS-based addition assignment of a transpose dense vector-dense matrix
   //        multiplication (\f$ \vec{y}^T+=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function performs the transpose dense vector-dense matrix multiplication based on
   // the according BLAS functionality.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         // trmv overwrites its vector argument; hence x is copied into a temporary first
         // and the result is added onto y afterwards.
         ResultType_t<VT1> tmp( serial( x ) );
         trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
         addAssign( y, tmp );
      }
      else {
         // gemv with alpha=1 and beta=1 accumulates x*A onto y in a single call.
         gemv( y, x, A, ET(1), ET(1) );
      }
   }
1611#endif
1612 //**********************************************************************************************
1613
1614 //**Addition assignment to sparse vectors*******************************************************
1615 // No special implementation for the addition assignment to sparse vectors.
1616 //**********************************************************************************************
1617
1618 //**Subtraction assignment to dense vectors*****************************************************
   /*!\brief Subtraction assignment of a transpose dense vector-dense matrix multiplication to
   //        a transpose dense vector (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression to be subtracted.
   // \return void
   //
   // This function implements the performance optimized subtraction assignment of a transpose
   // dense vector-dense matrix multiplication expression to a dense vector. If the matrix is
   // empty (or reduces to an empty product for strictly triangular 1x1 matrices), the target
   // vector is left unchanged.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // Nothing to subtract: the product x*A is identically zero in these cases.
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
         return;
      }

      LT x( serial( rhs.vec_ ) );  // Evaluation of the left-hand side dense vector operand
      RT A( serial( rhs.mat_ ) );  // Evaluation of the right-hand side dense matrix operand

      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size()     , "Invalid vector size"       );

      TDVecDMatMultExpr::selectSubAssignKernel( *lhs, x, A );
   }
1654 //**********************************************************************************************
1655
1656 //**Subtraction assignment to dense vectors (kernel selection)**********************************
   /*!\brief Selection of the kernel for a subtraction assignment of a transpose dense vector-
   //        dense matrix multiplication to a dense vector (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // The small kernel is selected for diagonal matrices, for non-evaluated computation
   // operands, and for matrices below the TDVECDMATMULT_THRESHOLD; otherwise the BLAS kernel
   // (which itself falls back to the large kernel if BLAS is not applicable) is used.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      if( ( IsDiagonal_v<MT1> ) ||
          ( IsComputation_v<MT> && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
         selectSmallSubAssignKernel( y, x, A );
      else
         selectBlasSubAssignKernel( y, x, A );
   }
1680 //**********************************************************************************************
1681
1682 //**Default subtraction assignment to dense vectors*********************************************
   /*!\brief Default subtraction assignment of a transpose dense vector-dense matrix
   //        multiplication (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function implements the default (scalar) subtraction assignment kernel for the
   // transpose dense vector-dense matrix multiplication.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      const size_t M( A.rows() );
      const size_t N( A.columns() );

      for( size_t i=0UL; i<M; ++i )
      {
         if( IsDiagonal_v<MT1> )
         {
            // Diagonal matrix: row i contributes to a single element of y only.
            y[i] -= x[i] * A(i,i);
         }
         else
         {
            // For (strictly) triangular matrices restrict the column range to the
            // structurally non-zero part of row i.
            const size_t jbegin( ( IsUpper_v<MT1> )
                                 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                                 :( 0UL ) );
            const size_t jend( ( IsLower_v<MT1> )
                               ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
                               :( N ) );
            BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );

            const size_t jnum( jend - jbegin );
            const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
            BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );

            // Manually unrolled by a factor of two; the single tail element (if any) follows.
            for( size_t j=jbegin; j<jpos; j+=2UL ) {
               y[j    ] -= x[i] * A(i,j    );
               y[j+1UL] -= x[i] * A(i,j+1UL);
            }
            if( jpos < jend ) {
               y[jpos] -= x[i] * A(i,jpos);
            }
         }
      }
   }
1735 //**********************************************************************************************
1736
1737 //**Default subtraction assignment to dense vectors (small matrices)****************************
   /*!\brief Default subtraction assignment of a small transpose dense vector-dense matrix
   //        multiplication (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function relays to the default (scalar) kernel. It is selected whenever the
   // vectorized default kernel cannot be applied to the given operand types.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultSubAssignKernel( y, x, A );
   }
1760 //**********************************************************************************************
1761
1762 //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
   /*!\brief Vectorized default subtraction assignment of a small transpose dense vector-dense
   //        matrix multiplication (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function implements the vectorized default subtraction assignment kernel for the
   // transpose dense vector-dense matrix multiplication. The columns of A are processed in
   // SIMD-wide chunks with decreasing unroll factors (8x, 4x, 3x, 2x, 1x), followed by a
   // scalar remainder loop in case the operands are not padded. This kernel is optimized for
   // small matrices.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      // A scalar remainder loop is only required if either operand is not padded to a
      // multiple of the SIMD vector width.
      constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );

      const size_t M( A.rows() );
      const size_t N( A.columns() );

      const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
      BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );

      size_t j( 0UL );

      for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
      {
         // For (strictly) triangular matrices only rows of the structurally non-zero band
         // of the current column chunk contribute to the accumulation.
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         // Subtract directly from the current contents of y.
         SIMDType xmm1( y.load(j             ) );
         SIMDType xmm2( y.load(j+SIMDSIZE    ) );
         SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
         SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
         SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
         SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
         SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
         SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );

         for( size_t i=ibegin; i<iend; ++i ) {
            const SIMDType x1( set( x[i] ) );
            xmm1 -= x1 * A.load(i,j             );
            xmm2 -= x1 * A.load(i,j+SIMDSIZE    );
            xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
            xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
            xmm5 -= x1 * A.load(i,j+SIMDSIZE*4UL);
            xmm6 -= x1 * A.load(i,j+SIMDSIZE*5UL);
            xmm7 -= x1 * A.load(i,j+SIMDSIZE*6UL);
            xmm8 -= x1 * A.load(i,j+SIMDSIZE*7UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
         y.store( j+SIMDSIZE*3UL, xmm4 );
         y.store( j+SIMDSIZE*4UL, xmm5 );
         y.store( j+SIMDSIZE*5UL, xmm6 );
         y.store( j+SIMDSIZE*6UL, xmm7 );
         y.store( j+SIMDSIZE*7UL, xmm8 );
      }

      for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( y.load(j             ) );
         SIMDType xmm2( y.load(j+SIMDSIZE    ) );
         SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
         SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );

         for( size_t i=ibegin; i<iend; ++i ) {
            const SIMDType x1( set( x[i] ) );
            xmm1 -= x1 * A.load(i,j             );
            xmm2 -= x1 * A.load(i,j+SIMDSIZE    );
            xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
            xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
         y.store( j+SIMDSIZE*3UL, xmm4 );
      }

      for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( y.load(j             ) );
         SIMDType xmm2( y.load(j+SIMDSIZE    ) );
         SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );

         for( size_t i=ibegin; i<iend; ++i ) {
            const SIMDType x1( set( x[i] ) );
            xmm1 -= x1 * A.load(i,j             );
            xmm2 -= x1 * A.load(i,j+SIMDSIZE    );
            xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
         }

         y.store( j             , xmm1 );
         y.store( j+SIMDSIZE    , xmm2 );
         y.store( j+SIMDSIZE*2UL, xmm3 );
      }

      for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( y.load(j         ) );
         SIMDType xmm2( y.load(j+SIMDSIZE) );

         for( size_t i=ibegin; i<iend; ++i ) {
            const SIMDType x1( set( x[i] ) );
            xmm1 -= x1 * A.load(i,j         );
            xmm2 -= x1 * A.load(i,j+SIMDSIZE);
         }

         y.store( j         , xmm1 );
         y.store( j+SIMDSIZE, xmm2 );
      }

      for( ; j<jpos; j+=SIMDSIZE )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( y.load(j) );

         for( size_t i=ibegin; i<iend; ++i ) {
            xmm1 -= set( x[i] ) * A.load(i,j);
         }

         y.store( j, xmm1 );
      }

      // Scalar remainder loop for the trailing columns (only taken for unpadded operands).
      // The positive dot product is accumulated first and subtracted from y in one step.
      for( ; remainder && j<N; ++j )
      {
         const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )?( min( j+1UL, M ) ):( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         ElementType value( x[ibegin] * A(ibegin,j) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            value += x[i] * A(i,j);
         }

         y[j] -= value;
      }
   }
1947 //**********************************************************************************************
1948
1949 //**Default subtraction assignment to dense vectors (large matrices)****************************
   /*!\brief Default subtraction assignment of a large transpose dense vector-dense matrix
   //        multiplication (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function relays to the default (scalar) kernel. It is selected whenever the
   // vectorized default kernel cannot be applied to the given operand types.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultSubAssignKernel( y, x, A );
   }
1972 //**********************************************************************************************
1973
1974 //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
1989 template< typename VT1 // Type of the left-hand side target vector
1990 , typename VT2 // Type of the left-hand side vector operand
1991 , typename MT1 > // Type of the right-hand side matrix operand
1992 static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1993 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1994 {
1995 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1996
1997 const size_t M( A.rows() );
1998 const size_t N( A.columns() );
1999
2000 const size_t jblock( 32768UL / sizeof( ElementType ) );
2001 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
2002
2003 BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
2004
2005 for( size_t jj=0U; jj<N; jj+=jblock ) {
2006 for( size_t ii=0UL; ii<M; ii+=iblock )
2007 {
2008 const size_t iend( min( ii+iblock, M ) );
2009 const size_t jtmp( min( jj+jblock, N ) );
2010 const size_t jend( ( IsLower_v<MT1> )
2011 ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
2012 :( jtmp ) );
2013
2014 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2015 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2016
2017 size_t j( ( IsUpper_v<MT1> )
2018 ?( max( jj, prevMultiple( ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ), SIMDSIZE ) ) )
2019 :( jj ) );
2020
2021 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2022 {
2023 SIMDType x1( set( x[ii] ) );
2024 SIMDType xmm1( x1 * A.load(ii,j ) );
2025 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
2026 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
2027 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
2028 SIMDType xmm5( x1 * A.load(ii,j+SIMDSIZE*4UL) );
2029 SIMDType xmm6( x1 * A.load(ii,j+SIMDSIZE*5UL) );
2030 SIMDType xmm7( x1 * A.load(ii,j+SIMDSIZE*6UL) );
2031 SIMDType xmm8( x1 * A.load(ii,j+SIMDSIZE*7UL) );
2032
2033 for( size_t i=ii+1UL; i<iend; ++i ) {
2034 x1 = set( x[i] );
2035 xmm1 += x1 * A.load(i,j );
2036 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2037 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2038 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2039 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
2040 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
2041 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
2042 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
2043 }
2044
2045 y.store( j , y.load(j ) - xmm1 );
2046 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2047 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2048 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2049 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
2050 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
2051 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
2052 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
2053 }
2054
2055 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2056 {
2057 SIMDType x1( set( x[ii] ) );
2058 SIMDType xmm1( x1 * A.load(ii,j ) );
2059 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
2060 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
2061 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
2062
2063 for( size_t i=ii+1UL; i<iend; ++i ) {
2064 x1 = set( x[i] );
2065 xmm1 += x1 * A.load(i,j );
2066 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2067 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2068 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2069 }
2070
2071 y.store( j , y.load(j ) - xmm1 );
2072 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2073 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2074 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2075 }
2076
2077 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2078 {
2079 SIMDType x1( set( x[ii] ) );
2080 SIMDType xmm1( x1 * A.load(ii,j ) );
2081 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
2082 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
2083
2084 for( size_t i=ii+1UL; i<iend; ++i ) {
2085 x1 = set( x[i] );
2086 xmm1 += x1 * A.load(i,j );
2087 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2088 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2089 }
2090
2091 y.store( j , y.load(j ) - xmm1 );
2092 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2093 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2094 }
2095
2096 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2097 {
2098 SIMDType x1( set( x[ii] ) );
2099 SIMDType xmm1( x1 * A.load(ii,j ) );
2100 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE) );
2101
2102 for( size_t i=ii+1UL; i<iend; ++i ) {
2103 x1 = set( x[i] );
2104 xmm1 += x1 * A.load(i,j );
2105 xmm2 += x1 * A.load(i,j+SIMDSIZE);
2106 }
2107
2108 y.store( j , y.load(j ) - xmm1 );
2109 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2110 }
2111
2112 for( ; j<jpos; j+=SIMDSIZE )
2113 {
2114 SIMDType xmm1( set( x[ii] ) * A.load(ii,j) );
2115
2116 for( size_t i=ii+1UL; i<iend; ++i ) {
2117 xmm1 += set( x[i] ) * A.load(i,j);
2118 }
2119
2120 y.store( j, y.load(j) - xmm1 );
2121 }
2122
2123 for( ; remainder && j<jend; ++j )
2124 {
2125 ElementType value( x[ii] * A(ii,j) );
2126
2127 for( size_t i=ii+1UL; i<iend; ++i ) {
2128 value += x[i] * A(i,j);
2129 }
2130
2131 y[j] -= value;
2132 }
2133 }
2134 }
2135 }
2137 //**********************************************************************************************
2138
2139 //**BLAS-based subtraction assignment to dense vectors (default)********************************
   /*!\brief Default subtraction assignment of a transpose dense vector-dense matrix
   //        multiplication (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function relays to the kernel for large matrices. It is selected whenever a BLAS
   // kernel cannot be applied to the given operand types.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      selectLargeSubAssignKernel( y, x, A );
   }
2162 //**********************************************************************************************
2163
2164 //**BLAS-based subtraction assignment to dense vectors******************************************
2165#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   /*!\brief BLAS-based subtraction assignment of a transpose dense vector-dense matrix
   //        multiplication (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side transpose dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \return void
   //
   // This function performs the transpose dense vector-dense matrix multiplication based on
   // the according BLAS functionality.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1 >  // Type of the right-hand side matrix operand
   static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         // trmv overwrites its vector argument; hence x is copied into a temporary first
         // and the result is subtracted from y afterwards.
         ResultType_t<VT1> tmp( serial( x ) );
         trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
         subAssign( y, tmp );
      }
      else {
         // gemv with alpha=-1 and beta=1 subtracts x*A from y in a single call.
         gemv( y, x, A, ET(-1), ET(1) );
      }
   }
2197#endif
2198 //**********************************************************************************************
2199
2200 //**Subtraction assignment to sparse vectors****************************************************
2201 // No special implementation for the subtraction assignment to sparse vectors.
2202 //**********************************************************************************************
2203
2204 //**Multiplication assignment to dense vectors**************************************************
   /*!\brief Multiplication assignment of a transpose dense vector-dense matrix multiplication
   //        to a transpose dense vector (\f$ \vec{y}^T*=\vec{x}^T*A \f$).
   // \ingroup dense_vector
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression to be multiplied.
   // \return void
   //
   // This function implements the performance optimized multiplication assignment of a
   // transpose dense vector-dense matrix multiplication expression to a dense vector. The
   // expression is evaluated into a temporary first, then the element-wise multiplication
   // is performed with the evaluated result.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( serial( rhs ) );
      multAssign( *lhs, tmp );
   }
2232 //**********************************************************************************************
2233
2234 //**Multiplication assignment to sparse vectors*************************************************
2235 // No special implementation for the multiplication assignment to sparse vectors.
2236 //**********************************************************************************************
2237
2238 //**Division assignment to dense vectors********************************************************
2251 template< typename VT1 > // Type of the target dense vector
2252 friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2253 {
2255
2259
2260 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2261
2262 const ResultType tmp( serial( rhs ) );
2263 divAssign( *lhs, tmp );
2264 }
2266 //**********************************************************************************************
2267
2268 //**Division assignment to sparse vectors*******************************************************
2269 // No special implementation for the division assignment to sparse vectors.
2270 //**********************************************************************************************
2271
2272 //**SMP assignment to dense vectors*************************************************************
2287 template< typename VT1 > // Type of the target dense vector
2288 friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2289 -> EnableIf_t< UseSMPAssign_v<VT1> >
2290 {
2292
2293 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2294
2295 if( rhs.mat_.rows() == 0UL ||
2296 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2297 reset( *lhs );
2298 return;
2299 }
2300 else if( rhs.mat_.columns() == 0UL ) {
2301 return;
2302 }
2303
2304 LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2305 RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2306
2307 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2308 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2309 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2310 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
2311
2312 smpAssign( *lhs, x * A );
2313 }
2315 //**********************************************************************************************
2316
2317 //**SMP assignment to sparse vectors************************************************************
2332 template< typename VT1 > // Type of the target sparse vector
2333 friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2334 -> EnableIf_t< UseSMPAssign_v<VT1> >
2335 {
2337
2341
2342 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2343
2344 const ResultType tmp( rhs );
2345 smpAssign( *lhs, tmp );
2346 }
2348 //**********************************************************************************************
2349
2350 //**SMP addition assignment to dense vectors****************************************************
2365 template< typename VT1 > // Type of the target dense vector
2366 friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2367 -> EnableIf_t< UseSMPAssign_v<VT1> >
2368 {
2370
2371 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2372
2373 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2374 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2375 return;
2376 }
2377
2378 LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2379 RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2380
2381 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2382 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2383 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2384 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
2385
2386 smpAddAssign( *lhs, x * A );
2387 }
2389 //**********************************************************************************************
2390
2391 //**SMP addition assignment to sparse vectors***************************************************
2392 // No special implementation for the SMP addition assignment to sparse vectors.
2393 //**********************************************************************************************
2394
2395 //**SMP subtraction assignment to dense vectors*************************************************
2410 template< typename VT1 > // Type of the target dense vector
2411 friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2412 -> EnableIf_t< UseSMPAssign_v<VT1> >
2413 {
2415
2416 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2417
2418 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2419 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2420 return;
2421 }
2422
2423 LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2424 RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2425
2426 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2427 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2428 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2429 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
2430
2431 smpSubAssign( *lhs, x * A );
2432 }
2434 //**********************************************************************************************
2435
2436 //**SMP subtraction assignment to sparse vectors************************************************
2437 // No special implementation for the SMP subtraction assignment to sparse vectors.
2438 //**********************************************************************************************
2439
2440 //**SMP multiplication assignment to dense vectors**********************************************
2455 template< typename VT1 > // Type of the target dense vector
2456 friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2457 -> EnableIf_t< UseSMPAssign_v<VT1> >
2458 {
2460
2464
2465 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2466
2467 const ResultType tmp( rhs );
2468 smpMultAssign( *lhs, tmp );
2469 }
2471 //**********************************************************************************************
2472
2473 //**SMP multiplication assignment to sparse vectors*********************************************
2474 // No special implementation for the SMP multiplication assignment to sparse vectors.
2475 //**********************************************************************************************
2476
2477 //**SMP division assignment to dense vectors****************************************************
2492 template< typename VT1 > // Type of the target dense vector
2493 friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2494 -> EnableIf_t< UseSMPAssign_v<VT1> >
2495 {
2497
2501
2502 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2503
2504 const ResultType tmp( rhs );
2505 smpDivAssign( *lhs, tmp );
2506 }
2508 //**********************************************************************************************
2509
2510 //**SMP division assignment to sparse vectors***************************************************
2511 // No special implementation for the SMP division assignment to sparse vectors.
2512 //**********************************************************************************************
2513
2514 //**Compile time checks*************************************************************************
2522 //**********************************************************************************************
2523};
2524//*************************************************************************************************
2525
2526
2527
2528
2529//=================================================================================================
2530//
2531// DVECSCALARMULTEXPR SPECIALIZATION
2532//
2533//=================================================================================================
2534
2535//*************************************************************************************************
2543template< typename VT // Type of the left-hand side dense vector
2544 , typename MT // Type of the right-hand side dense matrix
2545 , typename ST > // Type of the side scalar value
2546class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2547 : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2548 , private Computation
2549{
2550 private:
2551 //**Type definitions****************************************************************************
2552 using VMM = TDVecDMatMultExpr<VT,MT>;
2553 using RES = ResultType_t<VMM>;
2554 using VRT = ResultType_t<VT>;
2555 using MRT = ResultType_t<MT>;
2556 using VET = ElementType_t<VRT>;
2557 using MET = ElementType_t<MRT>;
2558 using VCT = CompositeType_t<VT>;
2559 using MCT = CompositeType_t<MT>;
2560 //**********************************************************************************************
2561
2562 //**********************************************************************************************
2564 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2565 //**********************************************************************************************
2566
2567 //**********************************************************************************************
2569 static constexpr bool evaluateMatrix =
2570 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2571 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
2572 //**********************************************************************************************
2573
2574 //**********************************************************************************************
2576
2579 template< typename T1 >
2580 static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
2581 //**********************************************************************************************
2582
2583 //**********************************************************************************************
2585
2587 template< typename T1, typename T2, typename T3, typename T4 >
2588 static constexpr bool UseBlasKernel_v =
2590 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2591 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2592 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2593 !IsDiagonal_v<T3> &&
2594 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2595 IsBLASCompatible_v< ElementType_t<T1> > &&
2596 IsBLASCompatible_v< ElementType_t<T2> > &&
2597 IsBLASCompatible_v< ElementType_t<T3> > &&
2598 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2599 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2600 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2601 //**********************************************************************************************
2602
2603 //**********************************************************************************************
2605
2608 template< typename T1, typename T2, typename T3, typename T4 >
2609 static constexpr bool UseVectorizedDefaultKernel_v =
2610 ( useOptimizedKernels &&
2611 !IsDiagonal_v<T3> &&
2612 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2613 IsSIMDCombinable_v< ElementType_t<T1>
2614 , ElementType_t<T2>
2615 , ElementType_t<T3>
2616 , T4 > &&
2617 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2618 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2619 //**********************************************************************************************
2620
2621 public:
2622 //**Type definitions****************************************************************************
2624 using This = DVecScalarMultExpr<VMM,ST,true>;
2625
2627 using BaseType = VecScalarMultExpr< DenseVector<This,true> >;
2628
2629 using ResultType = MultTrait_t<RES,ST>;
2630 using TransposeType = TransposeType_t<ResultType>;
2631 using ElementType = ElementType_t<ResultType>;
2632 using SIMDType = SIMDTrait_t<ElementType>;
2633 using ReturnType = const ElementType;
2634 using CompositeType = const ResultType;
2635
2637 using LeftOperand = const TDVecDMatMultExpr<VT,MT>;
2638
2640 using RightOperand = ST;
2641
2643 using LT = If_t< evaluateVector, const VRT, VCT >;
2644
2646 using RT = If_t< evaluateMatrix, const MRT, MCT >;
2647 //**********************************************************************************************
2648
2649 //**Compilation flags***************************************************************************
2651 static constexpr bool simdEnabled =
2652 ( !IsDiagonal_v<MT> &&
2653 VT::simdEnabled && MT::simdEnabled &&
2654 IsSIMDCombinable_v<VET,MET,ST> &&
2655 HasSIMDAdd_v<VET,MET> &&
2656 HasSIMDMult_v<VET,MET> );
2657
2659 static constexpr bool smpAssignable =
2660 ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
2661 //**********************************************************************************************
2662
2663 //**SIMD properties*****************************************************************************
2665 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
2666 //**********************************************************************************************
2667
2668 //**Constructor*********************************************************************************
   //! \brief Constructor for the DVecScalarMultExpr specialization.
   //
   // \param vector The left-hand side vector-matrix product of the multiplication expression.
   // \param scalar The right-hand side scalar of the multiplication expression.
   inline DVecScalarMultExpr( const VMM& vector, ST scalar )
      : vector_( vector )  // Left-hand side dense vector of the multiplication expression
      , scalar_( scalar )  // Right-hand side scalar of the multiplication expression
   {}
2678 //**********************************************************************************************
2679
2680 //**Subscript operator**************************************************************************
2686 inline ReturnType operator[]( size_t index ) const {
2687 BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2688 return vector_[index] * scalar_;
2689 }
2690 //**********************************************************************************************
2691
2692 //**At function*********************************************************************************
2699 inline ReturnType at( size_t index ) const {
2700 if( index >= vector_.size() ) {
2701 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2702 }
2703 return (*this)[index];
2704 }
2705 //**********************************************************************************************
2706
2707 //**Size function*******************************************************************************
2712 inline size_t size() const {
2713 return vector_.size();
2714 }
2715 //**********************************************************************************************
2716
2717 //**Left operand access*************************************************************************
2722 inline LeftOperand leftOperand() const {
2723 return vector_;
2724 }
2725 //**********************************************************************************************
2726
2727 //**Right operand access************************************************************************
2732 inline RightOperand rightOperand() const {
2733 return scalar_;
2734 }
2735 //**********************************************************************************************
2736
2737 //**********************************************************************************************
2743 template< typename T >
2744 inline bool canAlias( const T* alias ) const {
2745 return vector_.canAlias( alias );
2746 }
2747 //**********************************************************************************************
2748
2749 //**********************************************************************************************
2755 template< typename T >
2756 inline bool isAliased( const T* alias ) const {
2757 return vector_.isAliased( alias );
2758 }
2759 //**********************************************************************************************
2760
2761 //**********************************************************************************************
2766 inline bool isAligned() const {
2767 return vector_.isAligned();
2768 }
2769 //**********************************************************************************************
2770
2771 //**********************************************************************************************
   //! \brief Returns whether the expression can be used in SMP assignments.
   //
   // \return \a true in case the expression can be used in SMP assignments, \a false if not.
   //
   // SMP assignment pays off only when the product is not about to be handed to a
   // (serial) BLAS kernel and the target is large enough to amortize the
   // parallelization overhead.
   //
   // NOTE(review): this extraction appears to elide lines of the condition between
   // the BLAZE_BLAS_MODE test and the IsComputation_v test -- verify the full
   // condition against the upstream Blaze header before editing.
   inline bool canSMPAssign() const noexcept {
      RightOperand_t<VMM> A( vector_.rightOperand() );
      return ( !BLAZE_BLAS_MODE ||
               ( IsComputation_v<MT> && !evaluateMatrix ) ||
               ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
             ( size() > SMP_TDVECDMATMULT_THRESHOLD );
   }
2785 //**********************************************************************************************
2786
2787 private:
2788 //**Member variables****************************************************************************
2791 //**********************************************************************************************
2792
2793 //**Assignment to dense vectors*****************************************************************
   //! \brief Assignment of a scaled transpose dense vector-dense matrix
   //! multiplication to a dense vector (\f$ \vec{y}=s*(\vec{x}^T*A) \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression.
   // \return void
   //
   // Unwraps the inner vector-matrix product, short-circuits structurally empty
   // products, serially evaluates both operands, and dispatches to the kernel
   // selection logic together with the scalar factor.
   template< typename VT1 >  // Type of the target dense vector
   friend inline void assign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
   {
      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // Operands of the wrapped vector-matrix product expression.
      LeftOperand_t<VMM>  left ( rhs.vector_.leftOperand()  );
      RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

      // A matrix without rows (or a 1x1 strictly triangular matrix) yields an
      // all-zero result vector.
      if( right.rows() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
         reset( *lhs );
         return;
      }
      // A matrix without columns produces an empty target; nothing to do.
      else if( right.columns() == 0UL ) {
         return;
      }

      LT x( serial( left  ) );  // Evaluation of the left-hand side dense vector operand
      RT A( serial( right ) );  // Evaluation of the right-hand side dense matrix operand

      BLAZE_INTERNAL_ASSERT( x.size()    == left.size()    , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == right.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size()  , "Invalid vector size"       );

      DVecScalarMultExpr::selectAssignKernel( *lhs, x, A, rhs.scalar_ );
   }
2834 //**********************************************************************************************
2835
2836 //**Assignment to dense vectors (kernel selection)**********************************************
2847 template< typename VT1 // Type of the left-hand side target vector
2848 , typename VT2 // Type of the left-hand side vector operand
2849 , typename MT1 // Type of the right-hand side matrix operand
2850 , typename ST2 > // Type of the scalar value
2851 static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2852 {
2853 if( ( IsDiagonal_v<MT1> ) ||
2854 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2855 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2856 selectSmallAssignKernel( y, x, A, scalar );
2857 else
2858 selectBlasAssignKernel( y, x, A, scalar );
2859 }
2860 //**********************************************************************************************
2861
2862 //**Default assignment to dense vectors*********************************************************
   //! \brief Default (scalar) assignment kernel for \f$ \vec{y}=s*(\vec{x}^T*A) \f$.
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Row-major accumulation: the first contributing row initializes \a y, the
   // remaining rows accumulate into it (column range narrowed for triangular
   // matrices), and the scalar factor is applied once in a final pass.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1    // Type of the right-hand side matrix operand
           , typename ST2 >  // Type of the scalar value
   static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      // Strictly upper matrices never touch column 0; clear it explicitly.
      if( IsStrictlyUpper_v<MT1> ) {
         reset( y[0] );
      }

      // Non-lower matrices: row 0 contributes to (almost) every column and is
      // used to initialize y (no prior reset of y needed).
      if( !IsLower_v<MT1> )
      {
         for( size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<N; ++j ) {
            y[j] = x[0UL] * A(0UL,j);
         }
      }

      // Accumulate the remaining rows.
      for( size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
      {
         if( IsDiagonal_v<MT1> )
         {
            // Diagonal case: a single product per element, scaled immediately.
            y[i] = x[i] * A(i,i) * scalar;
         }
         else
         {
            // Column range restricted to the structural non-zero part of row i.
            const size_t jbegin( ( IsUpper_v<MT1> )
                                 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                                 :( 0UL ) );
            const size_t jend( ( IsLower_v<MT1> )
                               ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
                               :( N ) );
            BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );

            // Two-way unrolled accumulation plus a single-element remainder.
            const size_t jnum( jend - jbegin );
            const size_t jpos( jbegin + prevMultiple( jnum, 2UL ) );
            BLAZE_INTERNAL_ASSERT( jpos <= jbegin+jnum, "Invalid end calculation" );

            for( size_t j=jbegin; j<jpos; j+=2UL ) {
               y[j    ] += x[i] * A(i,j    );
               y[j+1UL] += x[i] * A(i,j+1UL);
            }
            if( jpos < jend ) {
               y[jpos] += x[i] * A(i,jpos);
            }
            // Lower matrices: row i provides the first contribution to column
            // jend, so assign instead of accumulate.
            if( IsLower_v<MT1> ) {
               y[jend] = x[i] * A(i,jend);
            }
         }
      }

      // Strictly lower matrices never touch the last column; clear it explicitly.
      if( IsStrictlyLower_v<MT1> ) {
         reset( y[N-1UL] );
      }

      // Final pass: apply the scalar factor (the diagonal case already did).
      if( !IsDiagonal_v<MT1> )
      {
         const size_t iend( IsStrictlyLower_v<MT1> ? N-1UL : N );
         for( size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<iend; ++j ) {
            y[j] *= scalar;
         }
      }
   }
2941 //**********************************************************************************************
2942
2943 //**Default assignment to dense vectors (small matrices)****************************************
2957 template< typename VT1 // Type of the left-hand side target vector
2958 , typename VT2 // Type of the left-hand side vector operand
2959 , typename MT1 // Type of the right-hand side matrix operand
2960 , typename ST2 > // Type of the scalar value
2961 static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2962 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2963 {
2964 selectDefaultAssignKernel( y, x, A, scalar );
2965 }
2966 //**********************************************************************************************
2967
2968 //**Default assignment to dense vectors (small matrices)****************************************
   //! \brief Vectorized small-matrix assignment kernel for \f$ \vec{y}=s*(\vec{x}^T*A) \f$.
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Columns of \a A are processed in unrolled tiers of 8/4/3/2/1 SIMD widths:
   // for each tier the dot product over the rows is accumulated in SIMD
   // registers and scaled once by \a factor before being stored to \a y. For
   // triangular matrices the row range [ibegin,iend) is narrowed to the
   // structural non-zero part. A trailing scalar loop covers the remainder
   // columns when either operand is unpadded.
   template< typename VT1    // Type of the left-hand side target vector
           , typename VT2    // Type of the left-hand side vector operand
           , typename MT1    // Type of the right-hand side matrix operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
   {
      // Remainder handling is only needed if either operand lacks padding.
      constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
      BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );

      const SIMDType factor( set( scalar ) );  // broadcast scalar once

      size_t j( 0UL );

      // Tier 1: 8 SIMD registers per column block.
      for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
      {
         // Row range limited to the structural non-zero part for triangular A.
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         // First row initializes the accumulators; remaining rows accumulate.
         SIMDType x1( set( x[ibegin] ) );
         SIMDType xmm1( x1 * A.load(ibegin,j             ) );
         SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE    ) );
         SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
         SIMDType xmm4( x1 * A.load(ibegin,j+SIMDSIZE*3UL) );
         SIMDType xmm5( x1 * A.load(ibegin,j+SIMDSIZE*4UL) );
         SIMDType xmm6( x1 * A.load(ibegin,j+SIMDSIZE*5UL) );
         SIMDType xmm7( x1 * A.load(ibegin,j+SIMDSIZE*6UL) );
         SIMDType xmm8( x1 * A.load(ibegin,j+SIMDSIZE*7UL) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            x1 = set( x[i] );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
            xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
            xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
            xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
            xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
            xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
         }

         y.store( j             , xmm1*factor );
         y.store( j+SIMDSIZE    , xmm2*factor );
         y.store( j+SIMDSIZE*2UL, xmm3*factor );
         y.store( j+SIMDSIZE*3UL, xmm4*factor );
         y.store( j+SIMDSIZE*4UL, xmm5*factor );
         y.store( j+SIMDSIZE*5UL, xmm6*factor );
         y.store( j+SIMDSIZE*6UL, xmm7*factor );
         y.store( j+SIMDSIZE*7UL, xmm8*factor );
      }

      // Tier 2: 4 SIMD registers per column block.
      for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType x1( set( x[ibegin] ) );
         SIMDType xmm1( x1 * A.load(ibegin,j             ) );
         SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE    ) );
         SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
         SIMDType xmm4( x1 * A.load(ibegin,j+SIMDSIZE*3UL) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            x1 = set( x[i] );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
            xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
         }

         y.store( j             , xmm1*factor );
         y.store( j+SIMDSIZE    , xmm2*factor );
         y.store( j+SIMDSIZE*2UL, xmm3*factor );
         y.store( j+SIMDSIZE*3UL, xmm4*factor );
      }

      // Tier 3: 3 SIMD registers per column block.
      for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType x1( set( x[ibegin] ) );
         SIMDType xmm1( x1 * A.load(ibegin,j             ) );
         SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE    ) );
         SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            x1 = set( x[i] );
            xmm1 += x1 * A.load(i,j             );
            xmm2 += x1 * A.load(i,j+SIMDSIZE    );
            xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
         }

         y.store( j             , xmm1*factor );
         y.store( j+SIMDSIZE    , xmm2*factor );
         y.store( j+SIMDSIZE*2UL, xmm3*factor );
      }

      // Tier 4: 2 SIMD registers per column block.
      for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType x1( set( x[ibegin] ) );
         SIMDType xmm1( x1 * A.load(ibegin,j         ) );
         SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            x1 = set( x[i] );
            xmm1 += x1 * A.load(i,j         );
            xmm2 += x1 * A.load(i,j+SIMDSIZE);
         }

         y.store( j         , xmm1*factor );
         y.store( j+SIMDSIZE, xmm2*factor );
      }

      // Tier 5: one SIMD register per column block.
      for( ; j<jpos; j+=SIMDSIZE )
      {
         const size_t ibegin( ( IsLower_v<MT1> )
                              ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                              :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )
                            ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
                            :( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         SIMDType xmm1( set( x[ibegin] ) * A.load(ibegin,j) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            xmm1 += set( x[i] ) * A.load(i,j);
         }

         y.store( j, xmm1*factor );
      }

      // Scalar remainder loop for unpadded operands.
      for( ; remainder && j<N; ++j )
      {
         const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> )?( min( j+1UL, M ) ):( M ) );
         BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );

         ElementType value( x[ibegin] * A(ibegin,j) );

         for( size_t i=ibegin+1UL; i<iend; ++i ) {
            value += x[i] * A(i,j);
         }

         y[j] = value * scalar;
      }
   }
3158 //**********************************************************************************************
3159
3160 //**Default assignment to dense vectors (large matrices)****************************************
3174 template< typename VT1 // Type of the left-hand side target vector
3175 , typename VT2 // Type of the left-hand side vector operand
3176 , typename MT1 // Type of the right-hand side matrix operand
3177 , typename ST2 > // Type of the scalar value
3178 static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3179 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3180 {
3181 selectDefaultAssignKernel( y, x, A, scalar );
3182 }
3183 //**********************************************************************************************
3184
3185 //**Default assignment to dense vectors (large matrices)****************************************
3199 template< typename VT1 // Type of the left-hand side target vector
3200 , typename VT2 // Type of the left-hand side vector operand
3201 , typename MT1 // Type of the right-hand side matrix operand
3202 , typename ST2 > // Type of the scalar value
3203 static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3204 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3205 {
3206 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3207
3208 const size_t M( A.rows() );
3209 const size_t N( A.columns() );
3210
3211 const size_t jblock( 32768UL / sizeof( ElementType ) );
3212 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3213
3214 const SIMDType factor( set( scalar ) );
3215
3216 BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3217
3218 reset( y );
3219
3220 for( size_t jj=0U; jj<N; jj+=jblock ) {
3221 for( size_t ii=0UL; ii<M; ii+=iblock )
3222 {
3223 const size_t iend( min( ii+iblock, M ) );
3224 const size_t jtmp( min( jj+jblock, N ) );
3225 const size_t jend( ( IsLower_v<MT1> )
3226 ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3227 :( jtmp ) );
3228
3229 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3230 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3231
3232 size_t j( ( IsUpper_v<MT1> )
3233 ?( max( jj, prevMultiple( ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ), SIMDSIZE ) ) )
3234 :( jj ) );
3235
3236 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3237 {
3238 SIMDType x1( set( x[ii] ) );
3239 SIMDType xmm1( x1 * A.load(ii,j ) );
3240 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
3241 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
3242 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
3243 SIMDType xmm5( x1 * A.load(ii,j+SIMDSIZE*4UL) );
3244 SIMDType xmm6( x1 * A.load(ii,j+SIMDSIZE*5UL) );
3245 SIMDType xmm7( x1 * A.load(ii,j+SIMDSIZE*6UL) );
3246 SIMDType xmm8( x1 * A.load(ii,j+SIMDSIZE*7UL) );
3247
3248 for( size_t i=ii+1UL; i<iend; ++i ) {
3249 x1 = set( x[i] );
3250 xmm1 += x1 * A.load(i,j );
3251 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3252 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3253 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3254 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3255 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3256 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3257 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3258 }
3259
3260 y.store( j , y.load(j ) + xmm1*factor );
3261 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3262 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3263 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3264 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3265 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3266 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3267 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3268 }
3269
3270 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3271 {
3272 SIMDType x1( set( x[ii] ) );
3273 SIMDType xmm1( x1 * A.load(ii,j ) );
3274 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
3275 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
3276 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
3277
3278 for( size_t i=ii+1UL; i<iend; ++i ) {
3279 x1 = set( x[i] );
3280 xmm1 += x1 * A.load(i,j );
3281 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3282 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3283 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3284 }
3285
3286 y.store( j , y.load(j ) + xmm1*factor );
3287 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3288 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3289 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3290 }
3291
3292 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3293 {
3294 SIMDType x1( set( x[ii] ) );
3295 SIMDType xmm1( x1 * A.load(ii,j ) );
3296 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
3297 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
3298
3299 for( size_t i=ii+1UL; i<iend; ++i ) {
3300 x1 = set( x[i] );
3301 xmm1 += x1 * A.load(i,j );
3302 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3303 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3304 }
3305
3306 y.store( j , y.load(j ) + xmm1*factor );
3307 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3308 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3309 }
3310
3311 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3312 {
3313 SIMDType x1( set( x[ii] ) );
3314 SIMDType xmm1( x1 * A.load(ii,j ) );
3315 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE) );
3316
3317 for( size_t i=ii+1UL; i<iend; ++i ) {
3318 x1 = set( x[i] );
3319 xmm1 += x1 * A.load(i,j );
3320 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3321 }
3322
3323 y.store( j , y.load(j ) + xmm1*factor );
3324 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3325 }
3326
3327 for( ; j<jpos; j+=SIMDSIZE )
3328 {
3329 SIMDType xmm1( set( x[ii] ) * A.load(ii,j) );
3330
3331 for( size_t i=ii+1UL; i<iend; ++i ) {
3332 xmm1 += set( x[i] ) * A.load(i,j);
3333 }
3334
3335 y.store( j, y.load(j) + xmm1*factor );
3336 }
3337
3338 for( ; remainder && j<jend; ++j )
3339 {
3340 ElementType value( x[ii] * A(ii,j) );
3341
3342 for( size_t i=ii+1UL; i<iend; ++i ) {
3343 value += x[i] * A(i,j);
3344 }
3345
3346 y[j] += value * scalar;
3347 }
3348 }
3349 }
3350 }
3351 //**********************************************************************************************
3352
3353 //**BLAS-based assignment to dense vectors (default)********************************************
// Fallback selected at compile time whenever the BLAS kernel criteria are NOT met
// (DisableIf on UseBlasKernel_v): simply forwards to the large-matrix default kernel.
3366 template< typename VT1 // Type of the left-hand side target vector
3367 , typename VT2 // Type of the left-hand side vector operand
3368 , typename MT1 // Type of the right-hand side matrix operand
3369 , typename ST2 > // Type of the scalar value
3370 static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3371 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3372 {
3373 selectLargeAssignKernel( y, x, A, scalar );
3374 }
3375 //**********************************************************************************************
3376
3377 //**BLAS-based assignment to dense vectors******************************************************
// BLAS-backed assignment kernel for y = scalar * ( x^T * A ), compiled in only when
// BLAS mode and BLAS matrix/vector multiplication are enabled and the operand types
// satisfy UseBlasKernel_v.
3378#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3392 template< typename VT1 // Type of the left-hand side target vector
3393 , typename VT2 // Type of the left-hand side vector operand
3394 , typename MT1 // Type of the right-hand side matrix operand
3395 , typename ST2 > // Type of the scalar value
3396 static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3397 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3398 {
3399 using ET = ElementType_t<VT1>;
3400
// Triangular matrices use trmv (which multiplies in place), so the scaled vector is
// assigned to y first; all other matrices go through gemv with beta == 0.
3401 if( IsTriangular_v<MT1> ) {
3402 assign( y, scalar * x );
3403 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3404 }
3405 else {
3406 gemv( y, x, A, ET(scalar), ET(0) );
3407 }
3408 }
3409#endif
3410 //**********************************************************************************************
3411
3412 //**Assignment to sparse vectors****************************************************************
// Assignment of the scaled vector/matrix product to a transpose sparse vector: the
// expression is evaluated serially into a dense temporary first, which is then
// assigned to the sparse target (avoids element-wise insertion during evaluation).
3424 template< typename VT1 > // Type of the target sparse vector
3425 friend inline void assign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3426 {
3428
3432
3433 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
3434
3435 const ResultType tmp( serial( rhs ) );
3436 assign( *lhs, tmp );
3437 }
3438 //**********************************************************************************************
3439
3440 //**Addition assignment to dense vectors********************************************************
// Addition assignment of the scaled vector/matrix product to a transpose dense vector:
// evaluates both operands (serially) and dispatches to the kernel selection.
3452 template< typename VT1 > // Type of the target dense vector
3453 friend inline void addAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3454 {
3456
3457 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
3458
3459 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3460 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3461
// Nothing to add for an empty matrix or for a 1x1 strictly triangular matrix
// (whose single element is necessarily zero).
3462 if( right.rows() == 0UL || right.columns() == 0UL ||
3463 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
3464 return;
3465 }
3466
3467 LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3468 RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3469
3470 BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3471 BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3472 BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3473 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
3474
3475 DVecScalarMultExpr::selectAddAssignKernel( *lhs, x, A, rhs.scalar_ );
3476 }
3477 //**********************************************************************************************
3478
3479 //**Addition assignment to dense vectors (kernel selection)*************************************
// Kernel selection for the addition assignment: the small kernel is chosen for
// diagonal matrices, for non-evaluated computation expressions, and for problem
// sizes below the TDVECDMATMULT_THRESHOLD; otherwise the BLAS path is attempted.
3490 template< typename VT1 // Type of the left-hand side target vector
3491 , typename VT2 // Type of the left-hand side vector operand
3492 , typename MT1 // Type of the right-hand side matrix operand
3493 , typename ST2 > // Type of the scalar value
3494 static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3495 {
3496 if( ( IsDiagonal_v<MT1> ) ||
3497 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3498 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3499 selectSmallAddAssignKernel( y, x, A, scalar );
3500 else
3501 selectBlasAddAssignKernel( y, x, A, scalar );
3502 }
3503 //**********************************************************************************************
3504
3505 //**Default addition assignment to dense vectors************************************************
// Default (non-vectorized) addition assignment kernel: delegates the element-wise
// work to the target vector's own addAssign of the expression x * A * scalar.
3519 template< typename VT1 // Type of the left-hand side target vector
3520 , typename VT2 // Type of the left-hand side vector operand
3521 , typename MT1 // Type of the right-hand side matrix operand
3522 , typename ST2 > // Type of the scalar value
3523 static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3524 {
3525 y.addAssign( x * A * scalar );
3526 }
3527 //**********************************************************************************************
3528
3529 //**Default addition assignment to dense vectors (small matrices)*******************************
// Small-matrix addition assignment fallback: chosen (DisableIf) when the vectorized
// default kernel cannot be used; forwards to the generic default kernel.
3543 template< typename VT1 // Type of the left-hand side target vector
3544 , typename VT2 // Type of the left-hand side vector operand
3545 , typename MT1 // Type of the right-hand side matrix operand
3546 , typename ST2 > // Type of the scalar value
3547 static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3548 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3549 {
3550 selectDefaultAddAssignKernel( y, x, A, scalar );
3551 }
3552 //**********************************************************************************************
3553
3554 //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized small-matrix kernel for y += scalar * ( x^T * A ). The column index j is
// processed in decreasing SIMD chunk widths (8, 4, 3, 2, 1 vectors per iteration); for
// triangular A the accumulated row range [ibegin,iend) is narrowed to the potentially
// non-zero part of the current column block. A scalar tail loop handles the unaligned
// column remainder when either operand type is unpadded.
3569 template< typename VT1 // Type of the left-hand side target vector
3570 , typename VT2 // Type of the left-hand side vector operand
3571 , typename MT1 // Type of the right-hand side matrix operand
3572 , typename ST2 > // Type of the scalar value
3573 static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3574 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3575 {
// A scalar remainder loop is only required if unaligned trailing columns can occur.
3576 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3577
3578 const size_t M( A.rows() );
3579 const size_t N( A.columns() );
3580
// End of the SIMD-processable column range, rounded down to a SIMDSIZE multiple
// when the scalar tail loop has to handle the remainder.
3581 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
3582 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
3583
3584 const SIMDType factor( set( scalar ) );
3585
3586 size_t j( 0UL );
3587
// Unrolled by eight SIMD vectors of columns per iteration.
3588 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3589 {
3590 const size_t ibegin( ( IsLower_v<MT1> )
3591 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3592 :( 0UL ) );
3593 const size_t iend( ( IsUpper_v<MT1> )
3594 ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3595 :( M ) );
3596 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3597
3598 SIMDType x1( set( x[ibegin] ) );
3599 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3600 SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE ) );
3601 SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
3602 SIMDType xmm4( x1 * A.load(ibegin,j+SIMDSIZE*3UL) );
3603 SIMDType xmm5( x1 * A.load(ibegin,j+SIMDSIZE*4UL) );
3604 SIMDType xmm6( x1 * A.load(ibegin,j+SIMDSIZE*5UL) );
3605 SIMDType xmm7( x1 * A.load(ibegin,j+SIMDSIZE*6UL) );
3606 SIMDType xmm8( x1 * A.load(ibegin,j+SIMDSIZE*7UL) );
3607
3608 for( size_t i=ibegin+1UL; i<iend; ++i ) {
3609 x1 = set( x[i] );
3610 xmm1 += x1 * A.load(i,j );
3611 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3612 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3613 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3614 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3615 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3616 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3617 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3618 }
3619
3620 y.store( j , y.load(j ) + xmm1*factor );
3621 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3622 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3623 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3624 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3625 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3626 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3627 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3628 }
3629
// Unrolled by four SIMD vectors of columns per iteration.
3630 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3631 {
3632 const size_t ibegin( ( IsLower_v<MT1> )
3633 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3634 :( 0UL ) );
3635 const size_t iend( ( IsUpper_v<MT1> )
3636 ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3637 :( M ) );
3638 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3639
3640 SIMDType x1( set( x[ibegin] ) );
3641 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3642 SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE ) );
3643 SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
3644 SIMDType xmm4( x1 * A.load(ibegin,j+SIMDSIZE*3UL) );
3645
3646 for( size_t i=ibegin+1UL; i<iend; ++i ) {
3647 x1 = set( x[i] );
3648 xmm1 += x1 * A.load(i,j );
3649 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3650 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3651 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3652 }
3653
3654 y.store( j , y.load(j ) + xmm1*factor );
3655 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3656 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3657 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3658 }
3659
// Unrolled by three SIMD vectors of columns per iteration.
3660 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3661 {
3662 const size_t ibegin( ( IsLower_v<MT1> )
3663 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3664 :( 0UL ) );
3665 const size_t iend( ( IsUpper_v<MT1> )
3666 ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3667 :( M ) );
3668 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3669
3670 SIMDType x1( set( x[ibegin] ) );
3671 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3672 SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE ) );
3673 SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
3674
3675 for( size_t i=ibegin+1UL; i<iend; ++i ) {
3676 x1 = set( x[i] );
3677 xmm1 += x1 * A.load(i,j );
3678 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3679 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3680 }
3681
3682 y.store( j , y.load(j ) + xmm1*factor );
3683 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3684 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3685 }
3686
// Unrolled by two SIMD vectors of columns per iteration.
3687 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3688 {
3689 const size_t ibegin( ( IsLower_v<MT1> )
3690 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3691 :( 0UL ) );
3692 const size_t iend( ( IsUpper_v<MT1> )
3693 ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3694 :( M ) );
3695 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3696
3697 SIMDType x1( set( x[ibegin] ) );
3698 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3699 SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE) );
3700
3701 for( size_t i=ibegin+1UL; i<iend; ++i ) {
3702 x1 = set( x[i] );
3703 xmm1 += x1 * A.load(i,j );
3704 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3705 }
3706
3707 y.store( j , y.load(j ) + xmm1*factor );
3708 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3709 }
3710
// One SIMD vector of columns per iteration.
3711 for( ; j<jpos; j+=SIMDSIZE )
3712 {
3713 const size_t ibegin( ( IsLower_v<MT1> )
3714 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3715 :( 0UL ) );
3716 const size_t iend( ( IsUpper_v<MT1> )
3717 ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3718 :( M ) );
3719 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3720
3721 SIMDType xmm1( set( x[ibegin] ) * A.load(ibegin,j) );
3722
3723 for( size_t i=ibegin+1UL; i<iend; ++i ) {
3724 xmm1 += set( x[i] ) * A.load(i,j);
3725 }
3726
3727 y.store( j, y.load(j) + xmm1*factor );
3728 }
3729
// Scalar tail loop for the remaining (unpadded) columns.
3730 for( ; remainder && j<N; ++j )
3731 {
3732 const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
3733 const size_t iend( ( IsUpper_v<MT1> )?( min( j+1UL, M ) ):( M ) );
3734 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3735
3736 ElementType value( x[ibegin] * A(ibegin,j) );
3737
3738 for( size_t i=ibegin+1UL; i<iend; ++i ) {
3739 value += x[i] * A(i,j);
3740 }
3741
3742 y[j] += value * scalar;
3743 }
3744 }
3745 //**********************************************************************************************
3746
3747 //**Default addition assignment to dense vectors (large matrices)*******************************
// Large-matrix addition assignment fallback: chosen (DisableIf) when the vectorized
// default kernel cannot be used; forwards to the generic default kernel.
3761 template< typename VT1 // Type of the left-hand side target vector
3762 , typename VT2 // Type of the left-hand side vector operand
3763 , typename MT1 // Type of the right-hand side matrix operand
3764 , typename ST2 > // Type of the scalar value
3765 static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3766 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3767 {
3768 selectDefaultAddAssignKernel( y, x, A, scalar );
3769 }
3770 //**********************************************************************************************
3771
3772 //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized, cache-blocked kernel for y += scalar * ( x^T * A ) for large matrices.
// The matrix is traversed in jj/ii tiles (jblock columns sized from a 32768-byte budget
// per ElementType, iblock of 8 or 4 rows); within a tile the column index j is processed
// in SIMD chunks of 8/4/3/2/1 vectors, with the j range clipped for lower/upper
// triangular A and a scalar tail loop for unpadded operands.
3787 template< typename VT1 // Type of the left-hand side target vector
3788 , typename VT2 // Type of the left-hand side vector operand
3789 , typename MT1 // Type of the right-hand side matrix operand
3790 , typename ST2 > // Type of the scalar value
3791 static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3792 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3793 {
// A scalar remainder loop is only required if unaligned trailing columns can occur.
3794 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3795
3796 const size_t M( A.rows() );
3797 const size_t N( A.columns() );
3798
// Tile sizes: jblock columns fit a fixed byte budget; fewer rows per tile for wide matrices.
3799 const size_t jblock( 32768UL / sizeof( ElementType ) );
3800 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3801
3802 const SIMDType factor( set( scalar ) );
3803
3804 BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3805
3806 for( size_t jj=0U; jj<N; jj+=jblock ) {
3807 for( size_t ii=0UL; ii<M; ii+=iblock )
3808 {
3809 const size_t iend( min( ii+iblock, M ) );
3810 const size_t jtmp( min( jj+jblock, N ) );
// For lower triangular A, columns beyond the row block contribute nothing.
3811 const size_t jend( ( IsLower_v<MT1> )
3812 ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3813 :( jtmp ) );
3814
3815 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3816 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3817
// For upper triangular A, skip the zero columns left of the row block.
3818 size_t j( ( IsUpper_v<MT1> )
3819 ?( max( jj, prevMultiple( ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ), SIMDSIZE ) ) )
3820 :( jj ) );
3821
// Unrolled by eight SIMD vectors of columns per iteration.
3822 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3823 {
3824 SIMDType x1( set( x[ii] ) );
3825 SIMDType xmm1( x1 * A.load(ii,j ) );
3826 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
3827 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
3828 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
3829 SIMDType xmm5( x1 * A.load(ii,j+SIMDSIZE*4UL) );
3830 SIMDType xmm6( x1 * A.load(ii,j+SIMDSIZE*5UL) );
3831 SIMDType xmm7( x1 * A.load(ii,j+SIMDSIZE*6UL) );
3832 SIMDType xmm8( x1 * A.load(ii,j+SIMDSIZE*7UL) );
3833
3834 for( size_t i=ii+1UL; i<iend; ++i ) {
3835 x1 = set( x[i] );
3836 xmm1 += x1 * A.load(i,j );
3837 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3838 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3839 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3840 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3841 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3842 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3843 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3844 }
3845
3846 y.store( j , y.load(j ) + xmm1*factor );
3847 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3848 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3849 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3850 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3851 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3852 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3853 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3854 }
3855
// Unrolled by four SIMD vectors of columns per iteration.
3856 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3857 {
3858 SIMDType x1( set( x[ii] ) );
3859 SIMDType xmm1( x1 * A.load(ii,j ) );
3860 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
3861 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
3862 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
3863
3864 for( size_t i=ii+1UL; i<iend; ++i ) {
3865 x1 = set( x[i] );
3866 xmm1 += x1 * A.load(i,j );
3867 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3868 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3869 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3870 }
3871
3872 y.store( j , y.load(j ) + xmm1*factor );
3873 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3874 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3875 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3876 }
3877
// Unrolled by three SIMD vectors of columns per iteration.
3878 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3879 {
3880 SIMDType x1( set( x[ii] ) );
3881 SIMDType xmm1( x1 * A.load(ii,j ) );
3882 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
3883 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
3884
3885 for( size_t i=ii+1UL; i<iend; ++i ) {
3886 x1 = set( x[i] );
3887 xmm1 += x1 * A.load(i,j );
3888 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3889 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3890 }
3891
3892 y.store( j , y.load(j ) + xmm1*factor );
3893 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3894 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3895 }
3896
// Unrolled by two SIMD vectors of columns per iteration.
3897 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3898 {
3899 SIMDType x1( set( x[ii] ) );
3900 SIMDType xmm1( x1 * A.load(ii,j ) );
3901 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE) );
3902
3903 for( size_t i=ii+1UL; i<iend; ++i ) {
3904 x1 = set( x[i] );
3905 xmm1 += x1 * A.load(i,j );
3906 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3907 }
3908
3909 y.store( j , y.load(j ) + xmm1*factor );
3910 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3911 }
3912
// One SIMD vector of columns per iteration.
3913 for( ; j<jpos; j+=SIMDSIZE )
3914 {
3915 SIMDType xmm1( set( x[ii] ) * A.load(ii,j) );
3916
3917 for( size_t i=ii+1UL; i<iend; ++i ) {
3918 xmm1 += set( x[i] ) * A.load(i,j);
3919 }
3920
3921 y.store( j, y.load(j) + xmm1*factor );
3922 }
3923
// Scalar tail loop for the remaining (unpadded) columns of this tile.
3924 for( ; remainder && j<jend; ++j )
3925 {
3926 ElementType value( x[ii] * A(ii,j) );
3927
3928 for( size_t i=ii+1UL; i<iend; ++i ) {
3929 value += x[i] * A(i,j);
3930 }
3931
3932 y[j] += value * scalar;
3933 }
3934 }
3935 }
3936 }
3937 //**********************************************************************************************
3938
3939 //**BLAS-based addition assignment to dense vectors (default)***********************************
// Fallback selected when the BLAS kernel criteria are NOT met (DisableIf on
// UseBlasKernel_v): forwards to the large-matrix default addition assignment kernel.
3953 template< typename VT1 // Type of the left-hand side target vector
3954 , typename VT2 // Type of the left-hand side vector operand
3955 , typename MT1 // Type of the right-hand side matrix operand
3956 , typename ST2 > // Type of the scalar value
3957 static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3958 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3959 {
3960 selectLargeAddAssignKernel( y, x, A, scalar );
3961 }
3962 //**********************************************************************************************
3963
3964 //**BLAS-based addition assignment to dense vectors*********************************************
// BLAS-backed addition assignment kernel for y += scalar * ( x^T * A ), compiled in
// only when BLAS mode and BLAS matrix/vector multiplication are enabled.
3965#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3979 template< typename VT1 // Type of the left-hand side target vector
3980 , typename VT2 // Type of the left-hand side vector operand
3981 , typename MT1 // Type of the right-hand side matrix operand
3982 , typename ST2 > // Type of the scalar value
3983 static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3984 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3985 {
3986 using ET = ElementType_t<VT1>;
3987
// trmv multiplies in place, so the scaled vector is computed into a temporary that is
// then added to y; all other matrices go through gemv with beta == 1.
3988 if( IsTriangular_v<MT1> ) {
3989 ResultType_t<VT1> tmp( serial( scalar * x ) );
3990 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3991 addAssign( y, tmp );
3992 }
3993 else {
3994 gemv( y, x, A, ET(scalar), ET(1) );
3995 }
3996 }
3997#endif
3998 //**********************************************************************************************
3999
4000 //**Addition assignment to sparse vectors*******************************************************
4001 // No special implementation for the addition assignment to sparse vectors.
4002 //**********************************************************************************************
4003
4004 //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of the scaled vector/matrix product to a transpose dense
// vector: evaluates both operands (serially) and dispatches to the kernel selection.
4016 template< typename VT1 > // Type of the target dense vector
4017 friend inline void subAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4018 {
4020
4021 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4022
4023 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4024 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4025
// Nothing to subtract for an empty matrix or for a 1x1 strictly triangular matrix
// (whose single element is necessarily zero).
4026 if( right.rows() == 0UL || right.columns() == 0UL ||
4027 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
4028 return;
4029 }
4030
4031 LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
4032 RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4033
4034 BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4035 BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4036 BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4037 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
4038
4039 DVecScalarMultExpr::selectSubAssignKernel( *lhs, x, A, rhs.scalar_ );
4040 }
4041 //**********************************************************************************************
4042
4043 //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Kernel selection for the subtraction assignment: mirrors the addition dispatch —
// small kernel for diagonal matrices, non-evaluated computation expressions, and
// problem sizes below TDVECDMATMULT_THRESHOLD; otherwise the BLAS path is attempted.
4054 template< typename VT1 // Type of the left-hand side target vector
4055 , typename VT2 // Type of the left-hand side vector operand
4056 , typename MT1 // Type of the right-hand side matrix operand
4057 , typename ST2 > // Type of the scalar value
4058 static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4059 {
4060 if( ( IsDiagonal_v<MT1> ) ||
4061 ( IsComputation_v<MT> && !evaluateMatrix ) ||
4062 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4063 selectSmallSubAssignKernel( y, x, A, scalar );
4064 else
4065 selectBlasSubAssignKernel( y, x, A, scalar );
4066 }
4067 //**********************************************************************************************
4068
4069 //**Default subtraction assignment to dense vectors*********************************************
// Default (non-vectorized) subtraction assignment kernel: delegates the element-wise
// work to the target vector's own subAssign of the expression x * A * scalar.
4083 template< typename VT1 // Type of the left-hand side target vector
4084 , typename VT2 // Type of the left-hand side vector operand
4085 , typename MT1 // Type of the right-hand side matrix operand
4086 , typename ST2 > // Type of the scalar value
4087 static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4088 {
4089 y.subAssign( x * A * scalar );
4090 }
4091 //**********************************************************************************************
4092
4093 //**Default subtraction assignment to dense vectors (small matrices)****************************
// Small-matrix subtraction assignment fallback: chosen (DisableIf) when the vectorized
// default kernel cannot be used; forwards to the generic default kernel.
4107 template< typename VT1 // Type of the left-hand side target vector
4108 , typename VT2 // Type of the left-hand side vector operand
4109 , typename MT1 // Type of the right-hand side matrix operand
4110 , typename ST2 > // Type of the scalar value
4111 static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4112 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4113 {
4114 selectDefaultSubAssignKernel( y, x, A, scalar );
4115 }
4116 //**********************************************************************************************
4117
4118 //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix kernel for y -= scalar * ( x^T * A ). Structurally identical
// to the small addition assignment kernel — SIMD chunks of 8/4/3/2/1 vectors of columns
// with the row range [ibegin,iend) narrowed for triangular A — except that the scaled
// partial results are subtracted from y instead of added.
4133 template< typename VT1 // Type of the left-hand side target vector
4134 , typename VT2 // Type of the left-hand side vector operand
4135 , typename MT1 // Type of the right-hand side matrix operand
4136 , typename ST2 > // Type of the scalar value
4137 static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4138 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4139 {
// A scalar remainder loop is only required if unaligned trailing columns can occur.
4140 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4141
4142 const size_t M( A.rows() );
4143 const size_t N( A.columns() );
4144
4145 const size_t jpos( remainder ? prevMultiple( N, SIMDSIZE ) : N );
4146 BLAZE_INTERNAL_ASSERT( jpos <= N, "Invalid end calculation" );
4147
4148 const SIMDType factor( set( scalar ) );
4149
4150 size_t j( 0UL );
4151
// Unrolled by eight SIMD vectors of columns per iteration.
4152 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4153 {
4154 const size_t ibegin( ( IsLower_v<MT1> )
4155 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4156 :( 0UL ) );
4157 const size_t iend( ( IsUpper_v<MT1> )
4158 ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4159 :( M ) );
4160 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4161
4162 SIMDType x1( set( x[ibegin] ) );
4163 SIMDType xmm1( x1 * A.load(ibegin,j ) );
4164 SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE ) );
4165 SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
4166 SIMDType xmm4( x1 * A.load(ibegin,j+SIMDSIZE*3UL) );
4167 SIMDType xmm5( x1 * A.load(ibegin,j+SIMDSIZE*4UL) );
4168 SIMDType xmm6( x1 * A.load(ibegin,j+SIMDSIZE*5UL) );
4169 SIMDType xmm7( x1 * A.load(ibegin,j+SIMDSIZE*6UL) );
4170 SIMDType xmm8( x1 * A.load(ibegin,j+SIMDSIZE*7UL) );
4171
4172 for( size_t i=ibegin+1UL; i<iend; ++i ) {
4173 x1 = set( x[i] );
4174 xmm1 += x1 * A.load(i,j );
4175 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4176 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4177 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4178 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4179 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4180 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4181 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4182 }
4183
// Note the subtraction: partial products scaled by 'factor' are removed from y.
4184 y.store( j , y.load(j ) - xmm1*factor );
4185 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4186 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4187 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4188 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4189 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4190 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4191 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4192 }
4193
// Unrolled by four SIMD vectors of columns per iteration.
4194 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4195 {
4196 const size_t ibegin( ( IsLower_v<MT1> )
4197 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4198 :( 0UL ) );
4199 const size_t iend( ( IsUpper_v<MT1> )
4200 ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4201 :( M ) );
4202 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4203
4204 SIMDType x1( set( x[ibegin] ) );
4205 SIMDType xmm1( x1 * A.load(ibegin,j ) );
4206 SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE ) );
4207 SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
4208 SIMDType xmm4( x1 * A.load(ibegin,j+SIMDSIZE*3UL) );
4209
4210 for( size_t i=ibegin+1UL; i<iend; ++i ) {
4211 x1 = set( x[i] );
4212 xmm1 += x1 * A.load(i,j );
4213 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4214 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4215 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4216 }
4217
4218 y.store( j , y.load(j ) - xmm1*factor );
4219 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4220 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4221 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4222 }
4223
// Unrolled by three SIMD vectors of columns per iteration.
4224 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4225 {
4226 const size_t ibegin( ( IsLower_v<MT1> )
4227 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4228 :( 0UL ) );
4229 const size_t iend( ( IsUpper_v<MT1> )
4230 ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4231 :( M ) );
4232 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4233
4234 SIMDType x1( set( x[ibegin] ) );
4235 SIMDType xmm1( x1 * A.load(ibegin,j ) );
4236 SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE ) );
4237 SIMDType xmm3( x1 * A.load(ibegin,j+SIMDSIZE*2UL) );
4238
4239 for( size_t i=ibegin+1UL; i<iend; ++i ) {
4240 x1 = set( x[i] );
4241 xmm1 += x1 * A.load(i,j );
4242 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4243 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4244 }
4245
4246 y.store( j , y.load(j ) - xmm1*factor );
4247 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4248 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4249 }
4250
// Unrolled by two SIMD vectors of columns per iteration.
4251 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4252 {
4253 const size_t ibegin( ( IsLower_v<MT1> )
4254 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4255 :( 0UL ) );
4256 const size_t iend( ( IsUpper_v<MT1> )
4257 ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4258 :( M ) );
4259 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4260
4261 SIMDType x1( set( x[ibegin] ) );
4262 SIMDType xmm1( x1 * A.load(ibegin,j ) );
4263 SIMDType xmm2( x1 * A.load(ibegin,j+SIMDSIZE) );
4264
4265 for( size_t i=ibegin+1UL; i<iend; ++i ) {
4266 x1 = set( x[i] );
4267 xmm1 += x1 * A.load(i,j );
4268 xmm2 += x1 * A.load(i,j+SIMDSIZE);
4269 }
4270
4271 y.store( j , y.load(j ) - xmm1*factor );
4272 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4273 }
4274
// One SIMD vector of columns per iteration.
4275 for( ; j<jpos; j+=SIMDSIZE )
4276 {
4277 const size_t ibegin( ( IsLower_v<MT1> )
4278 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4279 :( 0UL ) );
4280 const size_t iend( ( IsUpper_v<MT1> )
4281 ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4282 :( M ) );
4283 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4284
4285 SIMDType xmm1( set( x[ibegin] ) * A.load(ibegin,j) );
4286
4287 for( size_t i=ibegin+1UL; i<iend; ++i ) {
4288 xmm1 += set( x[i] ) * A.load(i,j);
4289 }
4290
4291 y.store( j, y.load(j) - xmm1*factor );
4292 }
4293
// Scalar tail loop for the remaining (unpadded) columns.
4294 for( ; remainder && j<N; ++j )
4295 {
4296 const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
4297 const size_t iend( ( IsUpper_v<MT1> )?( min( j+1UL, M ) ):( M ) );
4298 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4299
4300 ElementType value( x[ibegin] * A(ibegin,j) );
4301
4302 for( size_t i=ibegin+1UL; i<iend; ++i ) {
4303 value += x[i] * A(i,j);
4304 }
4305
4306 y[j] -= value * scalar;
4307 }
4308 }
4309 //**********************************************************************************************
4310
4311 //**Default subtraction assignment to dense vectors (large matrices)****************************
4325 template< typename VT1 // Type of the left-hand side target vector
4326 , typename VT2 // Type of the left-hand side vector operand
4327 , typename MT1 // Type of the right-hand side matrix operand
4328 , typename ST2 > // Type of the scalar value
4329 static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4330 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4331 {
4332 selectDefaultSubAssignKernel( y, x, A, scalar );
4333 }
4334 //**********************************************************************************************
4335
4336 //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4351 template< typename VT1 // Type of the left-hand side target vector
4352 , typename VT2 // Type of the left-hand side vector operand
4353 , typename MT1 // Type of the right-hand side matrix operand
4354 , typename ST2 > // Type of the scalar value
4355 static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4356 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4357 {
4358 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4359
4360 const size_t M( A.rows() );
4361 const size_t N( A.columns() );
4362
4363 const size_t jblock( 32768UL / sizeof( ElementType ) );
4364 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4365
4366 const SIMDType factor( set( scalar ) );
4367
4368 BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
4369
4370 for( size_t jj=0U; jj<N; jj+=jblock ) {
4371 for( size_t ii=0UL; ii<M; ii+=iblock )
4372 {
4373 const size_t iend( min( ii+iblock, M ) );
4374 const size_t jtmp( min( jj+jblock, N ) );
4375 const size_t jend( ( IsLower_v<MT1> )
4376 ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
4377 :( jtmp ) );
4378
4379 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
4380 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
4381
4382 size_t j( ( IsUpper_v<MT1> )
4383 ?( max( jj, prevMultiple( ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ), SIMDSIZE ) ) )
4384 :( jj ) );
4385
4386 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4387 {
4388 SIMDType x1( set( x[ii] ) );
4389 SIMDType xmm1( x1 * A.load(ii,j ) );
4390 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
4391 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
4392 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
4393 SIMDType xmm5( x1 * A.load(ii,j+SIMDSIZE*4UL) );
4394 SIMDType xmm6( x1 * A.load(ii,j+SIMDSIZE*5UL) );
4395 SIMDType xmm7( x1 * A.load(ii,j+SIMDSIZE*6UL) );
4396 SIMDType xmm8( x1 * A.load(ii,j+SIMDSIZE*7UL) );
4397
4398 for( size_t i=ii+1UL; i<iend; ++i ) {
4399 x1 = set( x[i] );
4400 xmm1 += x1 * A.load(i,j );
4401 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4402 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4403 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4404 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4405 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4406 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4407 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4408 }
4409
4410 y.store( j , y.load(j ) - xmm1*factor );
4411 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4412 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4413 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4414 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4415 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4416 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4417 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4418 }
4419
4420 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4421 {
4422 SIMDType x1( set( x[ii] ) );
4423 SIMDType xmm1( x1 * A.load(ii,j ) );
4424 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
4425 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
4426 SIMDType xmm4( x1 * A.load(ii,j+SIMDSIZE*3UL) );
4427
4428 for( size_t i=ii+1UL; i<iend; ++i ) {
4429 x1 = set( x[i] );
4430 xmm1 += x1 * A.load(i,j );
4431 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4432 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4433 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4434 }
4435
4436 y.store( j , y.load(j ) - xmm1*factor );
4437 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4438 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4439 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4440 }
4441
4442 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4443 {
4444 SIMDType x1( set( x[ii] ) );
4445 SIMDType xmm1( x1 * A.load(ii,j ) );
4446 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE ) );
4447 SIMDType xmm3( x1 * A.load(ii,j+SIMDSIZE*2UL) );
4448
4449 for( size_t i=ii+1UL; i<iend; ++i ) {
4450 x1 = set( x[i] );
4451 xmm1 += x1 * A.load(i,j );
4452 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4453 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4454 }
4455
4456 y.store( j , y.load(j ) - xmm1*factor );
4457 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4458 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4459 }
4460
4461 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4462 {
4463 SIMDType x1( set( x[ii] ) );
4464 SIMDType xmm1( x1 * A.load(ii,j ) );
4465 SIMDType xmm2( x1 * A.load(ii,j+SIMDSIZE) );
4466
4467 for( size_t i=ii+1UL; i<iend; ++i ) {
4468 x1 = set( x[i] );
4469 xmm1 += x1 * A.load(i,j );
4470 xmm2 += x1 * A.load(i,j+SIMDSIZE);
4471 }
4472
4473 y.store( j , y.load(j ) - xmm1*factor );
4474 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4475 }
4476
4477 for( ; j<jpos; j+=SIMDSIZE )
4478 {
4479 SIMDType xmm1( set( x[ii] ) * A.load(ii,j) );
4480
4481 for( size_t i=ii+1UL; i<iend; ++i ) {
4482 xmm1 += set( x[i] ) * A.load(i,j);
4483 }
4484
4485 y.store( j, y.load(j) - xmm1*factor );
4486 }
4487
4488 for( ; remainder && j<jend; ++j )
4489 {
4490 ElementType value( x[ii] * A(ii,j) );
4491
4492 for( size_t i=ii+1UL; i<iend; ++i ) {
4493 value += x[i] * A(i,j);
4494 }
4495
4496 y[j] -= value * scalar;
4497 }
4498 }
4499 }
4500 }
4501 //**********************************************************************************************
4502
4503 //**BLAS-based subtraction assignment to dense vectors (default)********************************
4517 template< typename VT1 // Type of the left-hand side target vector
4518 , typename VT2 // Type of the left-hand side vector operand
4519 , typename MT1 // Type of the right-hand side matrix operand
4520 , typename ST2 > // Type of the scalar value
4521 static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4522 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4523 {
4524 selectLargeSubAssignKernel( y, x, A, scalar );
4525 }
4526 //**********************************************************************************************
4527
4528 //**BLAS-based subtraction assignment to dense vectors******************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   /*!\brief BLAS-based kernel for the scaled subtraction assignment of a transpose dense vector-
   // dense matrix multiplication (\f$ \vec{y}^T-=s*\vec{x}^T*A \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param x The left-hand side dense vector operand.
   // \param A The right-hand side dense matrix operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Only available when both the BLAS mode and the BLAS matrix/vector multiplication kernels
   // are enabled, and only selected for BLAS-compatible type combinations (UseBlasKernel_v).
   */
   template< typename VT1 // Type of the left-hand side target vector
           , typename VT2 // Type of the left-hand side vector operand
           , typename MT1 // Type of the right-hand side matrix operand
           , typename ST2 > // Type of the scalar value
   static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         // Triangular matrix: trmv computes the product in place on a scaled copy of x,
         // and the result is subtracted from y afterwards.
         ResultType_t<VT1> tmp( serial( scalar * x ) );
         trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
         subAssign( y, tmp );
      }
      else {
         // General matrix: single gemv call computing y = (-scalar)*x*A + 1*y.
         gemv( y, x, A, ET(-scalar), ET(1) );
      }
   }
#endif
4562 //**********************************************************************************************
4563
4564 //**Subtraction assignment to sparse vectors****************************************************
4565 // No special implementation for the subtraction assignment to sparse vectors.
4566 //**********************************************************************************************
4567
4568 //**Multiplication assignment to dense vectors**************************************************
   //**Multiplication assignment to dense vectors**************************************************
   /*!\brief Multiplication assignment of a scaled transpose dense vector-dense matrix
   // multiplication expression to a dense vector.
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be multiplied.
   // \return void
   //
   // The full product is serially evaluated into a temporary first, since an element-wise
   // multiplication assignment cannot be performed directly on the expression.
   */
   template< typename VT1 > // Type of the target dense vector
   friend inline void multAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // Evaluate the complete scaled product, then multiply the target element-wise by it
      const ResultType tmp( serial( rhs ) );
      multAssign( *lhs, tmp );
   }
4594 //**********************************************************************************************
4595
4596 //**Multiplication assignment to sparse vectors*************************************************
4597 // No special implementation for the multiplication assignment to sparse vectors.
4598 //**********************************************************************************************
4599
4600 //**Division assignment to dense vectors********************************************************
   //**Division assignment to dense vectors********************************************************
   /*!\brief Division assignment of a scaled transpose dense vector-dense matrix multiplication
   // expression to a dense vector.
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression divisor.
   // \return void
   //
   // The full product is serially evaluated into a temporary first, since an element-wise
   // division assignment cannot be performed directly on the expression.
   */
   template< typename VT1 > // Type of the target dense vector
   friend inline void divAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // Evaluate the complete scaled product, then divide the target element-wise by it
      const ResultType tmp( serial( rhs ) );
      divAssign( *lhs, tmp );
   }
4626 //**********************************************************************************************
4627
4628 //**Division assignment to sparse vectors*******************************************************
4629 // No special implementation for the division assignment to sparse vectors.
4630 //**********************************************************************************************
4631
4632 //**SMP assignment to dense vectors*************************************************************
   //**SMP assignment to dense vectors*************************************************************
   /*!\brief SMP assignment of a scaled transpose dense vector-dense matrix multiplication
   // expression to a dense vector.
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be assigned.
   // \return void
   //
   // Enabled only when the SMP assignment strategy is selected for the target vector type
   // (UseSMPAssign_v). Structurally zero or empty products are handled up front; otherwise
   // both operands are evaluated and the parallel assignment is restarted on the evaluated
   // operands.
   */
   template< typename VT1 > // Type of the target dense vector
   friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // rhs.vector_ is the inner x*A expression; split it into its two operands
      LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
      RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

      // A matrix with zero rows (empty x) or a strictly triangular 1x1 matrix yields a
      // zero result vector; a matrix with zero columns yields an empty target.
      if( right.rows() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
         reset( *lhs );
         return;
      }
      else if( right.columns() == 0UL ) {
         return;
      }

      LT x( left ); // Evaluation of the left-hand side dense vector operand
      RT A( right ); // Evaluation of the right-hand side dense matrix operand

      BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
      BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
      BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );

      // Re-dispatch on the evaluated operands so the SMP machinery can parallelize
      smpAssign( *lhs, x * A * rhs.scalar_ );
   }
4676 //**********************************************************************************************
4677
4678 //**SMP assignment to sparse vectors************************************************************
   //**SMP assignment to sparse vectors************************************************************
   /*!\brief SMP assignment of a scaled transpose dense vector-dense matrix multiplication
   // expression to a sparse vector.
   //
   // \param lhs The target left-hand side sparse vector.
   // \param rhs The right-hand side scaled multiplication expression to be assigned.
   // \return void
   //
   // Enabled only when the SMP assignment strategy is selected for the target vector type
   // (UseSMPAssign_v). The dense product is evaluated into a temporary, which is then
   // SMP-assigned to the sparse target.
   */
   template< typename VT1 > // Type of the target sparse vector
   friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( rhs );
      smpAssign( *lhs, tmp );
   }
4707 //**********************************************************************************************
4708
4709 //**SMP addition assignment to dense vectors****************************************************
   //**SMP addition assignment to dense vectors****************************************************
   /*!\brief SMP addition assignment of a scaled transpose dense vector-dense matrix
   // multiplication expression to a dense vector.
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be added.
   // \return void
   //
   // Enabled only when the SMP assignment strategy is selected for the target vector type
   // (UseSMPAssign_v). If the product is structurally zero or empty there is nothing to add.
   */
   template< typename VT1 > // Type of the target dense vector
   friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // rhs.vector_ is the inner x*A expression; split it into its two operands
      LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
      RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

      // Empty operands or a strictly triangular 1x1 matrix contribute nothing
      if( right.rows() == 0UL || right.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
         return;
      }

      LT x( left ); // Evaluation of the left-hand side dense vector operand
      RT A( right ); // Evaluation of the right-hand side dense matrix operand

      BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
      BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
      BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );

      // Re-dispatch on the evaluated operands so the SMP machinery can parallelize
      smpAddAssign( *lhs, x * A * rhs.scalar_ );
   }
4749 //**********************************************************************************************
4750
4751 //**SMP addition assignment to sparse vectors***************************************************
4752 // No special implementation for the SMP addition assignment to sparse vectors.
4753 //**********************************************************************************************
4754
4755 //**SMP subtraction assignment to dense vectors*************************************************
   //**SMP subtraction assignment to dense vectors*************************************************
   /*!\brief SMP subtraction assignment of a scaled transpose dense vector-dense matrix
   // multiplication expression to a dense vector.
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be subtracted.
   // \return void
   //
   // Enabled only when the SMP assignment strategy is selected for the target vector type
   // (UseSMPAssign_v). If the product is structurally zero or empty there is nothing to
   // subtract.
   */
   template< typename VT1 > // Type of the target dense vector
   friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // rhs.vector_ is the inner x*A expression; split it into its two operands
      LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
      RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

      // Empty operands or a strictly triangular 1x1 matrix contribute nothing
      if( right.rows() == 0UL || right.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
         return;
      }

      LT x( left ); // Evaluation of the left-hand side dense vector operand
      RT A( right ); // Evaluation of the right-hand side dense matrix operand

      BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
      BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
      BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );

      // Re-dispatch on the evaluated operands so the SMP machinery can parallelize
      smpSubAssign( *lhs, x * A * rhs.scalar_ );
   }
4795 //**********************************************************************************************
4796
4797 //**SMP subtraction assignment to sparse vectors************************************************
4798 // No special implementation for the SMP subtraction assignment to sparse vectors.
4799 //**********************************************************************************************
4800
4801 //**SMP multiplication assignment to dense vectors**********************************************
   //**SMP multiplication assignment to dense vectors**********************************************
   /*!\brief SMP multiplication assignment of a scaled transpose dense vector-dense matrix
   // multiplication expression to a dense vector.
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be multiplied.
   // \return void
   //
   // Enabled only when the SMP assignment strategy is selected for the target vector type
   // (UseSMPAssign_v). The product is evaluated into a temporary first, since an element-wise
   // multiplication assignment cannot be performed directly on the expression.
   */
   template< typename VT1 > // Type of the target dense vector
   friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( rhs );
      smpMultAssign( *lhs, tmp );
   }
4831 //**********************************************************************************************
4832
4833 //**SMP multiplication assignment to sparse vectors*********************************************
4834 // No special implementation for the SMP multiplication assignment to sparse vectors.
4835 //**********************************************************************************************
4836
4837 //**SMP division assignment to dense vectors****************************************************
   //**SMP division assignment to dense vectors****************************************************
   /*!\brief SMP division assignment of a scaled transpose dense vector-dense matrix
   // multiplication expression to a dense vector.
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression divisor.
   // \return void
   //
   // Enabled only when the SMP assignment strategy is selected for the target vector type
   // (UseSMPAssign_v). The product is evaluated into a temporary first, since an element-wise
   // division assignment cannot be performed directly on the expression.
   */
   template< typename VT1 > // Type of the target dense vector
   friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( rhs );
      smpDivAssign( *lhs, tmp );
   }
4866 //**********************************************************************************************
4867
4868 //**SMP division assignment to sparse vectors***************************************************
4869 // No special implementation for the SMP division assignment to sparse vectors.
4870 //**********************************************************************************************
4871
4872 //**Compile time checks*************************************************************************
4881 //**********************************************************************************************
4882};
4884//*************************************************************************************************
4885
4886
4887
4888
4889//=================================================================================================
4890//
4891// GLOBAL BINARY ARITHMETIC OPERATORS
4892//
4893//=================================================================================================
4894
4895//*************************************************************************************************
/*!\brief Multiplication operator for a transpose dense vector and a row-major dense matrix
// (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param vec The left-hand side transpose dense vector.
// \param mat The right-hand side row-major dense matrix.
// \return The resulting transpose vector expression.
// \exception std::invalid_argument Vector and matrix sizes do not match.
//
// Returns a lazy TDVecDMatMultExpr expression object representing the product; no computation
// takes place until the expression is assigned or otherwise evaluated.
*/
template< typename VT // Type of the left-hand side dense vector
        , typename MT > // Type of the right-hand side dense matrix
inline decltype(auto)
   operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,false>& mat )
{

   // The inner dimensions of the product must agree
   if( (*vec).size() != (*mat).rows() ) {
      BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
   }

   using ReturnType = const TDVecDMatMultExpr<VT,MT>;
   return ReturnType( *vec, *mat );
}
4942//*************************************************************************************************
4943
4944
4945
4946
4947//=================================================================================================
4948//
4949// ISALIGNED SPECIALIZATIONS
4950//
4951//=================================================================================================
4952
4953//*************************************************************************************************
4955template< typename VT, typename MT >
4956struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4957 : public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
4958{};
4960//*************************************************************************************************
4961
4962} // namespace blaze
4963
4964#endif
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the complex data type.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Deactivation of problematic macros.
Header file for the multiplication trait.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Constraint on the transpose flag of vector types.
Header file for all SIMD functionality.
Data type constraint.
Constraint on the data type.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:530
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:430
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:461
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:520
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:540
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:584
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:169
VecScalarMultExpr< DenseVector< This, TF > > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:166
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:440
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:110
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:552
If_t< useAssign, const ResultType, const DVecScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:176
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:179
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:112
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:170
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:435
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:449
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:182
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:574
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:168
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:564
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:173
Base class for dense matrices.
Definition: DenseMatrix.h:82
Base class for N-dimensional dense vectors.
Definition: DenseVector.h:77
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Expression object for transpose dense vector-dense matrix multiplications.
Definition: TDVecDMatMultExpr.h:129
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:269
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDVecDMatMultExpr.h:240
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:348
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:213
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:303
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:392
If_t< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:228
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:216
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:212
ResultType_t< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:215
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:336
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:147
MultTrait_t< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:211
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDVecDMatMultExpr.h:233
CompositeType_t< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:137
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:255
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:214
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:393
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:360
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:326
If_t< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:225
If_t< IsExpression_v< MT >, const MT, const MT & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:222
ElementType_t< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:135
static constexpr bool evaluateVector
Compilation switch for the composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:142
CompositeType_t< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:136
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:380
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:219
ElementType_t< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:134
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDVecDMatMultExpr.h:246
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:370
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:316
ResultType_t< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseVector base class.
Header file for the MatMatMultExpr base class.
Header file for the TVecMatMultExpr base class.
Header file for the VecScalarMultExpr base class.
Header file for BLAS general matrix/vector multiplication functions (gemv)
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).
Definition: BLAS.h:169
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: TVecMatMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.
Definition: MatMatMultExpr.h:83
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.
Definition: RowVector.h:61
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.
Definition: MultTrait.h:165
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:221
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector.
Definition: DenseVector.h:192
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Header file for all forward declarations for expression class templates.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base class for all vector/matrix multiplication expression templates.
Definition: TVecMatMultExpr.h:69
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.