// Blaze 3.9 — TDMatDVecMultExpr.h
// Expression class for transpose (column-major) dense matrix / dense vector multiplications.
1//=================================================================================================
33//=================================================================================================
34
35#ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
37
38
39//*************************************************************************************************
40// Includes
41//*************************************************************************************************
42
45#include <blaze/math/Aliases.h>
63#include <blaze/math/SIMD.h>
85#include <blaze/system/BLAS.h>
91#include <blaze/util/Assert.h>
92#include <blaze/util/Complex.h>
94#include <blaze/util/EnableIf.h>
97#include <blaze/util/mpl/If.h>
98#include <blaze/util/Types.h>
106
107
108namespace blaze {
109
110//=================================================================================================
111//
112// CLASS TDMATDVECMULTEXPR
113//
114//=================================================================================================
115
116//*************************************************************************************************
123template< typename MT // Type of the left-hand side dense matrix
124 , typename VT > // Type of the right-hand side dense vector
126 : public MatVecMultExpr< DenseVector< TDMatDVecMultExpr<MT,VT>, false > >
127 , private Computation
128{
129 private:
130 //**Type definitions****************************************************************************
137 //**********************************************************************************************
138
139 //**********************************************************************************************
   //**********************************************************************************************
   //! Compilation switch for the up-front evaluation of the left-hand side matrix operand.
   /*! The matrix is evaluated into a temporary if it is itself a computation whose element
       type matches the vector's element type and is BLAS-compatible (enabling a direct BLAS
       call on the temporary), or if it generally requires an intermediate evaluation.
       NOTE(review): relies on the MET/VET element-type aliases declared in the (not shown)
       type-definition section above — confirm. */
   static constexpr bool evaluateMatrix =
      ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
          IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
   //**********************************************************************************************

   //**********************************************************************************************
   //! Compilation switch for the up-front evaluation of the right-hand side vector operand.
   /*! The vector is evaluated into a temporary if it is itself a computation or otherwise
       requires an intermediate evaluation. */
   static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
   //**********************************************************************************************
150
151 //**********************************************************************************************
153
   //! Helper variable template for the explicit application of the SFINAE principle.
   /*! Evaluates to \a true if SMP-parallel (re-)assignment is to be used. The decision
       depends only on whether either operand needs an up-front temporary evaluation;
       \a T1 (presumably the target vector type) is intentionally unused here. */
   template< typename T1 >
   static constexpr bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
160 //**********************************************************************************************
161
162 //**********************************************************************************************
164
167 template< typename T1, typename T2, typename T3 >
168 static constexpr bool UseBlasKernel_v =
170 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
171 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
172 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
173 !IsDiagonal_v<T2> &&
174 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
175 IsBLASCompatible_v< ElementType_t<T1> > &&
176 IsBLASCompatible_v< ElementType_t<T2> > &&
177 IsBLASCompatible_v< ElementType_t<T3> > &&
178 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
179 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
181 //**********************************************************************************************
182
183 //**********************************************************************************************
185
189 template< typename T1, typename T2, typename T3 >
190 static constexpr bool UseVectorizedDefaultKernel_v =
191 ( useOptimizedKernels &&
192 !IsDiagonal_v<T2> &&
193 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
194 IsSIMDCombinable_v< ElementType_t<T1>
196 , ElementType_t<T3> > &&
197 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
198 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
200 //**********************************************************************************************
201
202 public:
203 //**Type definitions****************************************************************************
206
209
214 using ReturnType = const ElementType;
215 using CompositeType = const ResultType;
216
218 using LeftOperand = If_t< IsExpression_v<MT>, const MT, const MT& >;
219
221 using RightOperand = If_t< IsExpression_v<VT>, const VT, const VT& >;
222
225
228 //**********************************************************************************************
229
   //**Compilation flags***************************************************************************
   //! Compilation switch for the expression template evaluation strategy.
   /*! SIMD evaluation is possible only for non-diagonal matrices whose operands are
       themselves SIMD-enabled and whose element types support vectorized addition and
       multiplication. */
   static constexpr bool simdEnabled =
      ( !IsDiagonal_v<MT> &&
        MT::simdEnabled && VT::simdEnabled &&
        HasSIMDAdd_v<MET,VET> &&
        HasSIMDMult_v<MET,VET> );

   //! Compilation switch for the expression template assignment strategy.
   /*! SMP-parallel assignment is possible only if neither operand requires an up-front
       temporary evaluation and both operands are themselves SMP-assignable. */
   static constexpr bool smpAssignable =
      ( !evaluateMatrix && MT::smpAssignable && !evaluateVector && VT::smpAssignable );
   //**********************************************************************************************

   //**SIMD properties*****************************************************************************
   //! The number of elements packed within a single SIMD vector.
   static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
   //**********************************************************************************************
247
   //**Constructor*********************************************************************************
   /*! \brief Constructor for the TDMatDVecMultExpr class.
   //
   // \param mat The left-hand side matrix operand of the multiplication expression.
   // \param vec The right-hand side vector operand of the multiplication expression.
   //
   // Stores (references to) the two operands; only asserts that the inner dimensions of
   // the product conform. No evaluation takes place here.
   */
   inline TDMatDVecMultExpr( const MT& mat, const VT& vec ) noexcept
      : mat_( mat )  // Left-hand side dense matrix of the multiplication expression
      , vec_( vec )  // Right-hand side dense vector of the multiplication expression
   {
      BLAZE_INTERNAL_ASSERT( mat_.columns() == vec_.size(), "Invalid matrix and vector sizes" );
   }
   //**********************************************************************************************
261
262 //**Subscript operator**************************************************************************
268 inline ReturnType operator[]( size_t index ) const {
269 BLAZE_INTERNAL_ASSERT( index < mat_.rows(), "Invalid vector access index" );
270
271 if( IsDiagonal_v<MT> )
272 {
273 return mat_(index,index) * vec_[index];
274 }
275 else if( IsLower_v<MT> && ( index + 8UL < mat_.rows() ) )
276 {
277 const size_t n( IsStrictlyLower_v<MT> ? index : index+1UL );
278 return subvector( row( mat_, index, unchecked ), 0UL, n, unchecked ) *
279 subvector( vec_, 0UL, n, unchecked );
280 }
281 else if( IsUpper_v<MT> && ( index > 8UL ) )
282 {
283 const size_t begin( IsStrictlyUpper_v<MT> ? index+1UL : index );
284 const size_t n ( mat_.columns() - begin );
285 return subvector( row( mat_, index, unchecked ), begin, n, unchecked ) *
287 }
288 else
289 {
290 return row( mat_, index, unchecked ) * vec_;
291 }
292 }
293 //**********************************************************************************************
294
295 //**At function*********************************************************************************
302 inline ReturnType at( size_t index ) const {
303 if( index >= mat_.rows() ) {
304 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
305 }
306 return (*this)[index];
307 }
308 //**********************************************************************************************
309
310 //**Size function*******************************************************************************
   /*! \brief Returns the current size/dimension of the vector.
   //
   // \return The size of the vector (the number of rows of the matrix operand).
   */
   inline size_t size() const noexcept {
      return mat_.rows();
   }
318 //**********************************************************************************************
319
320 //**Left operand access*************************************************************************
   /*! \brief Returns the left-hand side transpose dense matrix operand.
   //
   // \return The left-hand side transpose dense matrix operand.
   */
   inline LeftOperand leftOperand() const noexcept {
      return mat_;
   }
328 //**********************************************************************************************
329
330 //**Right operand access************************************************************************
   /*! \brief Returns the right-hand side dense vector operand.
   //
   // \return The right-hand side dense vector operand.
   */
   inline RightOperand rightOperand() const noexcept {
      return vec_;
   }
338 //**********************************************************************************************
339
340 //**********************************************************************************************
346 template< typename T >
347 inline bool canAlias( const T* alias ) const noexcept {
348 return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
349 }
350 //**********************************************************************************************
351
352 //**********************************************************************************************
358 template< typename T >
359 inline bool isAliased( const T* alias ) const noexcept {
360 return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
361 }
362 //**********************************************************************************************
363
364 //**********************************************************************************************
369 inline bool isAligned() const noexcept {
370 return mat_.isAligned() && vec_.isAligned();
371 }
372 //**********************************************************************************************
373
374 //**********************************************************************************************
379 inline bool canSMPAssign() const noexcept {
380 return ( !BLAZE_BLAS_MODE ||
383 ( IsComputation_v<MT> && !evaluateMatrix ) ||
384 ( mat_.rows() * mat_.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
385 ( size() > SMP_TDMATDVECMULT_THRESHOLD );
386 }
387 //**********************************************************************************************
388
389 private:
390 //**Member variables****************************************************************************
393 //**********************************************************************************************
394
395 //**Assignment to dense vectors*****************************************************************
   /*! \brief Assignment of a transpose dense matrix-dense vector multiplication to a
   //        dense vector (\f$ \vec{y} = A * \vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression to be assigned.
   // \return void
   //
   // Handles the degenerate shapes first: an empty row count means nothing to write; an
   // empty column count (or a strictly triangular 1-column matrix, whose only column is
   // structurally zero) yields an all-zero result. Otherwise both operands are evaluated
   // serially into composites and the actual work is dispatched to selectAssignKernel().
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline void assign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
   {
      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      if( rhs.mat_.rows() == 0UL ) {
         return;
      }
      else if( rhs.mat_.columns() == 0UL ||
               ( IsStrictlyTriangular_v<MT> && rhs.mat_.columns() == 1UL ) ) {
         reset( *lhs );
         return;
      }

      LT A( serial( rhs.mat_ ) );  // Evaluation of the left-hand side dense matrix operand
      RT x( serial( rhs.vec_ ) );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows" );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size" );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size()     , "Invalid vector size" );

      TDMatDVecMultExpr::selectAssignKernel( *lhs, A, x );
   }
435 //**********************************************************************************************
436
437 //**Assignment to dense vectors (kernel selection)**********************************************
448 template< typename VT1 // Type of the left-hand side target vector
449 , typename MT1 // Type of the left-hand side matrix operand
450 , typename VT2 > // Type of the right-hand side vector operand
451 static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
452 {
453 if( ( IsDiagonal_v<MT1> ) ||
454 ( IsComputation_v<MT> && !evaluateMatrix ) ||
455 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
456 selectSmallAssignKernel( y, A, x );
457 else
458 selectBlasAssignKernel( y, A, x );
459 }
461 //**********************************************************************************************
462
463 //**Default assignment to dense vectors*********************************************************
   /*! \brief Default assignment of a transpose dense matrix-dense vector multiplication
   //        (\f$ \vec{y} = A * \vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // Column-major (axpy-style) traversal: the first structurally non-zero column of each
   // row initializes y with '=', every later column accumulates with '+='. Triangular
   // structure is exploited by restricting the row range [ibegin,iend) per column; the
   // inner loop is unrolled by two. NOTE(review): for upper matrices the trailing
   // 'y[iend] = ...' performs the first write to that row — presumably column j is the
   // first structural non-zero of row iend; confirm against the triangular invariants.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      const size_t M( A.rows() );
      const size_t N( A.columns() );

      // Row 0 of a strictly lower matrix is structurally zero and is never written below.
      if( IsStrictlyLower_v<MT1> ) {
         reset( y[0] );
      }

      // Column 0 initializes y for all matrices that are not upper triangular.
      if( !IsUpper_v<MT1> )
      {
         for( size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<M; ++i ) {
            y[i] = A(i,0UL) * x[0UL];
         }
      }

      for( size_t j=( IsUpper_v<MT1> && !IsStrictlyUpper_v<MT1> ? 0UL : 1UL ); j<N; ++j )
      {
         if( IsDiagonal_v<MT1> )
         {
            y[j] = A(j,j) * x[j];
         }
         else
         {
            // Restrict the row range to the structurally non-zero part of column j.
            const size_t ibegin( ( IsLower_v<MT1> )
                                 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                                 :( 0UL ) );
            const size_t iend( ( IsUpper_v<MT1> )
                               ?( IsStrictlyUpper_v<MT1> ? j-1UL : j )
                               :( M ) );
            BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );

            const size_t inum( iend - ibegin );
            const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
            BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );

            // 2x unrolled accumulation plus scalar remainder.
            for( size_t i=ibegin; i<ipos; i+=2UL ) {
               y[i    ] += A(i    ,j) * x[j];
               y[i+1UL] += A(i+1UL,j) * x[j];
            }
            if( ipos < iend ) {
               y[ipos] += A(ipos,j) * x[j];
            }
            if( IsUpper_v<MT1> ) {
               y[iend] = A(iend,j) * x[j];
            }
         }
      }

      // Row M-1 of a strictly upper matrix is structurally zero.
      if( IsStrictlyUpper_v<MT1> ) {
         reset( y[M-1UL] );
      }
   }
534 //**********************************************************************************************
535
536 //**Default assignment to dense vectors (small matrices)****************************************
   /*! \brief Default assignment of a small transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y} = A * \vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // SFINAE fallback: selected when the vectorized default kernel is not applicable;
   // simply forwards to the scalar default kernel.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      selectDefaultAssignKernel( y, A, x );
   }
559 //**********************************************************************************************
560
561 //**Vectorized default assignment to dense vectors (small matrices)*****************************
   /*! \brief Vectorized default assignment of a small transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y} = A * \vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // SIMD kernel: processes 8, 4, 3, 2, then 1 SIMD-register-wide row strips per
   // iteration (register blocking), accumulating each strip across the structurally
   // non-zero columns [jbegin,jend) before a single store. A final scalar loop handles
   // the unpadded row remainder when either operand type is not padded.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      // A scalar tail loop is only needed if either operand is unpadded.
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );

      const size_t M( A.rows() );
      const size_t N( A.columns() );

      const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
      BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );

      size_t i( 0UL );

      // 8-wide strip: rows [i, i+8*SIMDSIZE).
      for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
      {
         // Restrict the column range to the structurally non-zero part of the strip.
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         // Initialize the accumulators with the first column, then accumulate the rest.
         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );
         SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jbegin) * x1 );
         SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jbegin) * x1 );
         SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jbegin) * x1 );
         SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jbegin) * x1 );
         SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
            xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
            xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
            xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
            xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
            xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
         }

         y.store( i             , xmm1 );
         y.store( i+SIMDSIZE    , xmm2 );
         y.store( i+SIMDSIZE*2UL, xmm3 );
         y.store( i+SIMDSIZE*3UL, xmm4 );
         y.store( i+SIMDSIZE*4UL, xmm5 );
         y.store( i+SIMDSIZE*5UL, xmm6 );
         y.store( i+SIMDSIZE*6UL, xmm7 );
         y.store( i+SIMDSIZE*7UL, xmm8 );
      }

      // 4-wide strip.
      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );
         SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
            xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
         }

         y.store( i             , xmm1 );
         y.store( i+SIMDSIZE    , xmm2 );
         y.store( i+SIMDSIZE*2UL, xmm3 );
         y.store( i+SIMDSIZE*3UL, xmm4 );
      }

      // 3-wide strip.
      for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
         }

         y.store( i             , xmm1 );
         y.store( i+SIMDSIZE    , xmm2 );
         y.store( i+SIMDSIZE*2UL, xmm3 );
      }

      // 2-wide strip.
      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i         ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i         ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE,j) * x1;
         }

         y.store( i         , xmm1 );
         y.store( i+SIMDSIZE, xmm2 );
      }

      // Single SIMD-register strip.
      for( ; i<ipos; i+=SIMDSIZE )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType xmm1( A.load(i,jbegin) * set( x[jbegin] ) );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            xmm1 += A.load(i,j) * set( x[j] );
         }

         y.store( i, xmm1 );
      }

      // Scalar remainder for unpadded operands.
      for( ; remainder && i<M; ++i )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )?( min( i+1UL, N ) ):( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         ElementType value( A(i,jbegin) * x[jbegin] );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            value += A(i,j) * x[j];
         }

         y[i] = value;
      }
   }
749 //**********************************************************************************************
750
751 //**Default assignment to dense vectors (large matrices)****************************************
   /*! \brief Default assignment of a large transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y} = A * \vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // SFINAE fallback: selected when the vectorized default kernel is not applicable;
   // simply forwards to the scalar default kernel.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      selectDefaultAssignKernel( y, A, x );
   }
774 //**********************************************************************************************
775
776 //**Vectorized default assignment to dense vectors (large matrices)*****************************
   /*! \brief Vectorized default assignment of a large transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y} = A * \vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // Cache-blocked variant: y is reset once, then partial products of iblock x jblock
   // tiles are accumulated into it (load-add-store), using the same 8/4/3/2/1 SIMD
   // register-strip scheme as the small kernel. iblock is sized to roughly 32 KiB of
   // matrix column data; jblock shrinks for wide matrices. Triangular structure trims
   // each tile's row/column ranges.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      // A scalar tail loop is only needed if either operand is unpadded.
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );

      const size_t M( A.rows() );
      const size_t N( A.columns() );

      const size_t iblock( 32768UL / sizeof( ElementType ) );
      const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );

      BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );

      reset( y );

      // NOTE(review): '0U' (not '0UL') is inconsistent with the rest of the file but
      // harmless — the comparison promotes to size_t.
      for( size_t ii=0U; ii<M; ii+=iblock ) {
         for( size_t jj=0UL; jj<N; jj+=jblock )
         {
            const size_t jend( min( jj+jblock, N ) );
            const size_t itmp( min( ii+iblock, M ) );
            // Trim the tile's row range for (strictly) upper matrices.
            const size_t iend( ( IsUpper_v<MT1> )
                               ?( min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
                               :( itmp ) );

            const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
            BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );

            // For lower matrices, skip rows above the structural non-zeros (kept
            // SIMD-aligned via prevMultiple).
            size_t i( ( IsLower_v<MT1> )
                      ?( max( ii, prevMultiple( ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ), SIMDSIZE ) ) )
                      :( ii ) );

            // 8-wide strip: accumulate the tile's partial product into y.
            for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
            {
               SIMDType x1( set( x[jj] ) );
               SIMDType xmm1( A.load(i             ,jj) * x1 );
               SIMDType xmm2( A.load(i+SIMDSIZE    ,jj) * x1 );
               SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
               SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
               SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jj) * x1 );
               SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jj) * x1 );
               SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jj) * x1 );
               SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jj) * x1 );

               for( size_t j=jj+1UL; j<jend; ++j ) {
                  x1 = set( x[j] );
                  xmm1 += A.load(i             ,j) * x1;
                  xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
                  xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
                  xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
                  xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
                  xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
                  xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
                  xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
               }

               y.store( i             , y.load(i             ) + xmm1 );
               y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) + xmm2 );
               y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
               y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
               y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
               y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
               y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
               y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
            }

            // 4-wide strip.
            for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
            {
               SIMDType x1( set( x[jj] ) );
               SIMDType xmm1( A.load(i             ,jj) * x1 );
               SIMDType xmm2( A.load(i+SIMDSIZE    ,jj) * x1 );
               SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
               SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );

               for( size_t j=jj+1UL; j<jend; ++j ) {
                  x1 = set( x[j] );
                  xmm1 += A.load(i             ,j) * x1;
                  xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
                  xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
                  xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
               }

               y.store( i             , y.load(i             ) + xmm1 );
               y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) + xmm2 );
               y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
               y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
            }

            // 3-wide strip.
            for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
            {
               SIMDType x1( set( x[jj] ) );
               SIMDType xmm1( A.load(i             ,jj) * x1 );
               SIMDType xmm2( A.load(i+SIMDSIZE    ,jj) * x1 );
               SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );

               for( size_t j=jj+1UL; j<jend; ++j ) {
                  x1 = set( x[j] );
                  xmm1 += A.load(i             ,j) * x1;
                  xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
                  xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
               }

               y.store( i             , y.load(i             ) + xmm1 );
               y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) + xmm2 );
               y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
            }

            // 2-wide strip.
            for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
            {
               SIMDType x1( set( x[jj] ) );
               SIMDType xmm1( A.load(i         ,jj) * x1 );
               SIMDType xmm2( A.load(i+SIMDSIZE,jj) * x1 );

               for( size_t j=jj+1UL; j<jend; ++j ) {
                  x1 = set( x[j] );
                  xmm1 += A.load(i         ,j) * x1;
                  xmm2 += A.load(i+SIMDSIZE,j) * x1;
               }

               y.store( i         , y.load(i         ) + xmm1 );
               y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
            }

            // Single SIMD-register strip.
            for( ; i<ipos; i+=SIMDSIZE )
            {
               SIMDType xmm1( A.load(i,jj) * set( x[jj] ) );

               for( size_t j=jj+1UL; j<jend; ++j ) {
                  xmm1 += A.load(i,j) * set( x[j] );
               }

               y.store( i, y.load(i) + xmm1 );
            }

            // Scalar remainder for unpadded operands.
            for( ; remainder && i<iend; ++i )
            {
               ElementType value( A(i,jj) * x[jj] );

               for( size_t j=jj+1UL; j<jend; ++j ) {
                  value += A(i,j) * x[j];
               }

               y[i] += value;
            }
         }
      }
   }
940 //**********************************************************************************************
941
942 //**BLAS-based assignment to dense vectors (default)********************************************
   /*! \brief BLAS-based assignment of a transpose dense matrix-dense vector multiplication
   //        (\f$ \vec{y} = A * \vec{x} \f$) — default fallback.
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // SFINAE fallback: selected when no BLAS kernel is available for the operand types;
   // forwards to the large (blocked) default kernel instead.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
   {
      selectLargeAssignKernel( y, A, x );
   }
965 //**********************************************************************************************
966
967 //**BLAS-based assignment to dense vectors******************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   /*! \brief BLAS-based assignment of a transpose dense matrix-dense vector multiplication
   //        (\f$ \vec{y} = A * \vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // Triangular matrices use the BLAS trmv routine: x is copied into y first, since trmv
   // multiplies in place. All other matrices use gemv with alpha = 1 and beta = 0 (i.e.
   // y is overwritten, not accumulated).
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         assign( y, x );
         trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      }
      else {
         gemv( y, A, x, ET(1), ET(0) );
      }
   }
#endif
1000 //**********************************************************************************************
1001
1002 //**Assignment to sparse vectors****************************************************************
   /*! \brief Assignment of a transpose dense matrix-dense vector multiplication to a
   //        sparse vector (\f$ \vec{y} = A * \vec{x} \f$).
   //
   // \param lhs The target left-hand side sparse vector.
   // \param rhs The right-hand side multiplication expression to be assigned.
   // \return void
   //
   // The product is first evaluated serially into a dense temporary of ResultType, which
   // is then assigned to the sparse target (sparse targets cannot be written by the
   // dense kernels directly).
   */
   template< typename VT1 >  // Type of the target sparse vector
   friend inline void assign( SparseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
   {
      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( serial( rhs ) );
      assign( *lhs, tmp );
   }
1030 //**********************************************************************************************
1031
1032 //**Addition assignment to dense vectors********************************************************
   /*! \brief Addition assignment of a transpose dense matrix-dense vector multiplication
   //        to a dense vector (\f$ \vec{y} += A * \vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression to be added.
   // \return void
   //
   // Degenerate shapes (empty matrix, or a strictly triangular 1x1 matrix whose single
   // element is structurally zero) contribute nothing and return early — unlike plain
   // assignment, no reset of the target is needed. Otherwise both operands are evaluated
   // serially and the work is dispatched to selectAddAssignKernel().
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline void addAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
   {
      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
         return;
      }

      LT A( serial( rhs.mat_ ) );  // Evaluation of the left-hand side dense matrix operand
      RT x( serial( rhs.vec_ ) );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows" );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size" );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size()     , "Invalid vector size" );

      TDMatDVecMultExpr::selectAddAssignKernel( *lhs, A, x );
   }
1068 //**********************************************************************************************
1069
1070 //**Addition assignment to dense vectors (kernel selection)*************************************
1081 template< typename VT1 // Type of the left-hand side target vector
1082 , typename MT1 // Type of the left-hand side matrix operand
1083 , typename VT2 > // Type of the right-hand side vector operand
1084 static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1085 {
1086 if( ( IsDiagonal_v<MT1> ) ||
1087 ( IsComputation_v<MT> && !evaluateMatrix ) ||
1088 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1089 selectSmallAddAssignKernel( y, A, x );
1090 else
1091 selectBlasAddAssignKernel( y, A, x );
1092 }
1094 //**********************************************************************************************
1095
1096 //**Default addition assignment to dense vectors************************************************
   /*! \brief Default addition assignment of a transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y} += A * \vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // Column-major (axpy-style) traversal accumulating into y with '+=' throughout — no
   // initialization pass is needed since the target already holds valid values.
   // Triangular structure restricts the row range [ibegin,iend) per column; the inner
   // loop is unrolled by two with a scalar remainder.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      const size_t M( A.rows() );
      const size_t N( A.columns() );

      for( size_t j=0UL; j<N; ++j )
      {
         if( IsDiagonal_v<MT1> )
         {
            y[j] += A(j,j) * x[j];
         }
         else
         {
            // Restrict the row range to the structurally non-zero part of column j.
            const size_t ibegin( ( IsLower_v<MT1> )
                                 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                                 :( 0UL ) );
            const size_t iend( ( IsUpper_v<MT1> )
                               ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
                               :( M ) );
            BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );

            const size_t inum( iend - ibegin );
            const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
            BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );

            // 2x unrolled accumulation plus scalar remainder.
            for( size_t i=ibegin; i<ipos; i+=2UL ) {
               y[i    ] += A(i    ,j) * x[j];
               y[i+1UL] += A(i+1UL,j) * x[j];
            }
            if( ipos < iend ) {
               y[ipos] += A(ipos,j) * x[j];
            }
         }
      }
   }
1149 //**********************************************************************************************
1150
1151 //**Default addition assignment to dense vectors (small matrices)*******************************
   /*! \brief Default addition assignment of a small transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y} += A * \vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // SFINAE fallback: selected when the vectorized default kernel is not applicable;
   // simply forwards to the scalar default kernel.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      selectDefaultAddAssignKernel( y, A, x );
   }
1174 //**********************************************************************************************
1175
1176 //**Vectorized default addition assignment to dense vectors (small matrices)********************
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
{
   // Vectorized addition assignment kernel (y += A*x) for small column-major
   // matrices: rows are processed in SIMD panels of 8/4/3/2/1 vectors; each
   // panel accumulates over all relevant columns so that every element of y
   // is loaded and stored exactly once.
   constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   // End of the fully vectorizable row range; for unpadded operands the last
   // M-ipos rows are handled by the scalar remainder loop at the bottom.
   const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
   BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );

   size_t i( 0UL );

   // Panel of 8 SIMD vectors per iteration
   for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
   {
      // For upper/lower triangular matrices, restrict the column range to the
      // structurally non-zero part of the current row panel.
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      // Seed the accumulators with the current contents of y
      SIMDType xmm1( y.load(i             ) );
      SIMDType xmm2( y.load(i+SIMDSIZE    ) );
      SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
      SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
      SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
      SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
      SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
      SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );

      for( size_t j=jbegin; j<jend; ++j ) {
         const SIMDType x1( set( x[j] ) );  // Broadcast x[j] to all SIMD lanes
         xmm1 += A.load(i             ,j) * x1;
         xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
         xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
         xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
         xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
         xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
         xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
         xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
      }

      y.store( i             , xmm1 );
      y.store( i+SIMDSIZE    , xmm2 );
      y.store( i+SIMDSIZE*2UL, xmm3 );
      y.store( i+SIMDSIZE*3UL, xmm4 );
      y.store( i+SIMDSIZE*4UL, xmm5 );
      y.store( i+SIMDSIZE*5UL, xmm6 );
      y.store( i+SIMDSIZE*6UL, xmm7 );
      y.store( i+SIMDSIZE*7UL, xmm8 );
   }

   // Panel of 4 SIMD vectors per iteration
   for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      SIMDType xmm1( y.load(i             ) );
      SIMDType xmm2( y.load(i+SIMDSIZE    ) );
      SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
      SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );

      for( size_t j=jbegin; j<jend; ++j ) {
         const SIMDType x1( set( x[j] ) );
         xmm1 += A.load(i             ,j) * x1;
         xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
         xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
         xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
      }

      y.store( i             , xmm1 );
      y.store( i+SIMDSIZE    , xmm2 );
      y.store( i+SIMDSIZE*2UL, xmm3 );
      y.store( i+SIMDSIZE*3UL, xmm4 );
   }

   // Panel of 3 SIMD vectors per iteration
   for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      SIMDType xmm1( y.load(i             ) );
      SIMDType xmm2( y.load(i+SIMDSIZE    ) );
      SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );

      for( size_t j=jbegin; j<jend; ++j ) {
         const SIMDType x1( set( x[j] ) );
         xmm1 += A.load(i             ,j) * x1;
         xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
         xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
      }

      y.store( i             , xmm1 );
      y.store( i+SIMDSIZE    , xmm2 );
      y.store( i+SIMDSIZE*2UL, xmm3 );
   }

   // Panel of 2 SIMD vectors per iteration
   for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      SIMDType xmm1( y.load(i         ) );
      SIMDType xmm2( y.load(i+SIMDSIZE) );

      for( size_t j=jbegin; j<jend; ++j ) {
         const SIMDType x1( set( x[j] ) );
         xmm1 += A.load(i         ,j) * x1;
         xmm2 += A.load(i+SIMDSIZE,j) * x1;
      }

      y.store( i         , xmm1 );
      y.store( i+SIMDSIZE, xmm2 );
   }

   // Single SIMD vector per iteration
   for( ; i<ipos; i+=SIMDSIZE )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      SIMDType xmm1( y.load(i) );

      for( size_t j=jbegin; j<jend; ++j ) {
         xmm1 += A.load(i,j) * set( x[j] );
      }

      y.store( i, xmm1 );
   }

   // Scalar remainder loop for the unpadded tail rows (only taken if
   // 'remainder' is true, i.e. at least one operand is unpadded)
   for( ; remainder && i<M; ++i )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )?( min( i+1UL, N ) ):( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      ElementType value( A(i,jbegin) * x[jbegin] );

      for( size_t j=jbegin+1UL; j<jend; ++j ) {
         value += A(i,j) * x[j];
      }

      y[i] += value;
   }
}
1360 //**********************************************************************************************
1361
1362 //**Default addition assignment to dense vectors (large matrices)*******************************
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
{
   // Fallback for operand combinations that cannot use the SIMD kernel:
   // delegate directly to the scalar default kernel.
   selectDefaultAddAssignKernel( y, A, x );
}
1385 //**********************************************************************************************
1386
1387 //**Vectorized default addition assignment to dense vectors (large matrices)********************
1401 template< typename VT1 // Type of the left-hand side target vector
1402 , typename MT1 // Type of the left-hand side matrix operand
1403 , typename VT2 > // Type of the right-hand side vector operand
1404 static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1405 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1406 {
1407 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1408
1409 const size_t M( A.rows() );
1410 const size_t N( A.columns() );
1411
1412 const size_t iblock( 32768UL / sizeof( ElementType ) );
1413 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1414
1415 BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1416
1417 for( size_t ii=0U; ii<M; ii+=iblock ) {
1418 for( size_t jj=0UL; jj<N; jj+=jblock )
1419 {
1420 const size_t jend( min( jj+jblock, N ) );
1421 const size_t itmp( min( ii+iblock, M ) );
1422 const size_t iend( ( IsUpper_v<MT1> )
1423 ?( min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
1424 :( itmp ) );
1425
1426 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1427 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1428
1429 size_t i( ( IsLower_v<MT1> )
1430 ?( max( ii, prevMultiple( ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ), SIMDSIZE ) ) )
1431 :( ii ) );
1432
1433 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1434 {
1435 SIMDType x1( set( x[jj] ) );
1436 SIMDType xmm1( A.load(i ,jj) * x1 );
1437 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
1438 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
1439 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
1440 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jj) * x1 );
1441 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jj) * x1 );
1442 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jj) * x1 );
1443 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jj) * x1 );
1444
1445 for( size_t j=jj+1UL; j<jend; ++j ) {
1446 x1 = set( x[j] );
1447 xmm1 += A.load(i ,j) * x1;
1448 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1449 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1450 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1451 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1452 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1453 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1454 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1455 }
1456
1457 y.store( i , y.load(i ) + xmm1 );
1458 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1459 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1460 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1461 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
1462 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
1463 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
1464 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
1465 }
1466
1467 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1468 {
1469 SIMDType x1( set( x[jj] ) );
1470 SIMDType xmm1( A.load(i ,jj) * x1 );
1471 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
1472 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
1473 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
1474
1475 for( size_t j=jj+1UL; j<jend; ++j ) {
1476 x1 = set( x[j] );
1477 xmm1 += A.load(i ,j) * x1;
1478 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1479 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1480 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1481 }
1482
1483 y.store( i , y.load(i ) + xmm1 );
1484 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1485 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1486 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1487 }
1488
1489 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1490 {
1491 SIMDType x1( set( x[jj] ) );
1492 SIMDType xmm1( A.load(i ,jj) * x1 );
1493 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
1494 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
1495
1496 for( size_t j=jj+1UL; j<jend; ++j ) {
1497 x1 = set( x[j] );
1498 xmm1 += A.load(i ,j) * x1;
1499 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1500 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1501 }
1502
1503 y.store( i , y.load(i ) + xmm1 );
1504 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1505 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1506 }
1507
1508 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1509 {
1510 SIMDType x1( set( x[jj] ) );
1511 SIMDType xmm1( A.load(i ,jj) * x1 );
1512 SIMDType xmm2( A.load(i+SIMDSIZE,jj) * x1 );
1513
1514 for( size_t j=jj+1UL; j<jend; ++j ) {
1515 x1 = set( x[j] );
1516 xmm1 += A.load(i ,j) * x1;
1517 xmm2 += A.load(i+SIMDSIZE,j) * x1;
1518 }
1519
1520 y.store( i , y.load(i ) + xmm1 );
1521 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
1522 }
1523
1524 for( ; i<ipos; i+=SIMDSIZE )
1525 {
1526 SIMDType xmm1( A.load(i,jj) * set( x[jj] ) );
1527
1528 for( size_t j=jj+1UL; j<jend; ++j ) {
1529 xmm1 += A.load(i,j) * set( x[j] );
1530 }
1531
1532 y.store( i, y.load(i) + xmm1 );
1533 }
1534
1535 for( ; remainder && i<iend; ++i )
1536 {
1537 ElementType value( A(i,jj) * x[jj] );
1538
1539 for( size_t j=jj+1UL; j<jend; ++j ) {
1540 value += A(i,j) * x[j];
1541 }
1542
1543 y[i] += value;
1544 }
1545 }
1546 }
1547 }
1549 //**********************************************************************************************
1550
1551 //**BLAS-based addition assignment to dense vectors (default)***********************************
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
{
   // Fallback for operand combinations that cannot use the BLAS kernel:
   // delegate to the large-matrix default kernel.
   selectLargeAddAssignKernel( y, A, x );
}
1574 //**********************************************************************************************
1575
1576 //**BLAS-based addition assignment to dense vectors*********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
{
   using ET = ElementType_t<VT1>;

   if( IsTriangular_v<MT1> ) {
      // trmv works in place: evaluate x into a temporary, overwrite the
      // temporary with A*tmp, then add the result onto y.
      ResultType_t<VT1> tmp( serial( x ) );
      trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      addAssign( y, tmp );
   }
   else {
      // gemv: y = ET(1)*A*x + ET(1)*y, i.e. y += A*x
      gemv( y, A, x, ET(1), ET(1) );
   }
}
#endif
1610 //**********************************************************************************************
1611
1612 //**Addition assignment to sparse vectors*******************************************************
1613 // No special implementation for the addition assignment to sparse vectors.
1614 //**********************************************************************************************
1615
1616 //**Subtraction assignment to dense vectors*****************************************************
template< typename VT1 >  // Type of the target dense vector
friend inline void subAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
{
   // Subtraction assignment of the matrix/vector product: lhs -= A*x.

   BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

   // Early exit: the product is structurally zero (empty operands, or a
   // 1x1 strictly triangular matrix, whose single element is zero).
   if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
       ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
      return;
   }

   LT A( serial( rhs.mat_ ) );  // Evaluation of the left-hand side dense matrix operand
   RT x( serial( rhs.vec_ ) );  // Evaluation of the right-hand side dense vector operand

   BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows"    );
   BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
   BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size"       );
   BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size()     , "Invalid vector size"       );

   // Dispatch to the best fitting kernel (small/BLAS/large)
   TDMatDVecMultExpr::selectSubAssignKernel( *lhs, A, x );
}
1652 //**********************************************************************************************
1653
1654 //**Subtraction assignment to dense vectors (kernel selection)**********************************
1665 template< typename VT1 // Type of the left-hand side target vector
1666 , typename MT1 // Type of the left-hand side matrix operand
1667 , typename VT2 > // Type of the right-hand side vector operand
1668 static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1669 {
1670 if( ( IsDiagonal_v<MT1> ) ||
1671 ( IsComputation_v<MT> && !evaluateMatrix ) ||
1672 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1673 selectSmallSubAssignKernel( y, A, x );
1674 else
1675 selectBlasSubAssignKernel( y, A, x );
1676 }
1678 //**********************************************************************************************
1679
1680 //**Default subtraction assignment to dense vectors*********************************************
1694 template< typename VT1 // Type of the left-hand side target vector
1695 , typename MT1 // Type of the left-hand side matrix operand
1696 , typename VT2 > // Type of the right-hand side vector operand
1697 static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1698 {
1699 const size_t M( A.rows() );
1700 const size_t N( A.columns() );
1701
1702 for( size_t j=0UL; j<N; ++j )
1703 {
1704 if( IsDiagonal_v<MT1> )
1705 {
1706 y[j] -= A(j,j) * x[j];
1707 }
1708 else
1709 {
1710 const size_t ibegin( ( IsLower_v<MT1> )
1711 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1712 :( 0UL ) );
1713 const size_t iend( ( IsUpper_v<MT1> )
1714 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1715 :( M ) );
1716 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1717
1718 const size_t inum( iend - ibegin );
1719 const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
1720 BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );
1721
1722 for( size_t i=ibegin; i<ipos; i+=2UL ) {
1723 y[i ] -= A(i ,j) * x[j];
1724 y[i+1UL] -= A(i+1UL,j) * x[j];
1725 }
1726 if( ipos < iend ) {
1727 y[ipos] -= A(ipos,j) * x[j];
1728 }
1729 }
1730 }
1731 }
1733 //**********************************************************************************************
1734
1735 //**Default subtraction assignment to dense vectors (small matrices)****************************
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
{
   // Fallback for operand combinations that cannot use the SIMD kernel:
   // delegate directly to the scalar default kernel.
   selectDefaultSubAssignKernel( y, A, x );
}
1758 //**********************************************************************************************
1759
1760 //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
{
   // Vectorized subtraction assignment kernel (y -= A*x) for small
   // column-major matrices: rows are processed in SIMD panels of 8/4/3/2/1
   // vectors; each panel subtracts contributions from all relevant columns so
   // that every element of y is loaded and stored exactly once.
   constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   // End of the fully vectorizable row range; for unpadded operands the last
   // M-ipos rows are handled by the scalar remainder loop at the bottom.
   const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
   BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );

   size_t i( 0UL );

   // Panel of 8 SIMD vectors per iteration
   for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
   {
      // For upper/lower triangular matrices, restrict the column range to the
      // structurally non-zero part of the current row panel.
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      // Seed the accumulators with the current contents of y and subtract
      // the column contributions in place.
      SIMDType xmm1( y.load(i             ) );
      SIMDType xmm2( y.load(i+SIMDSIZE    ) );
      SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
      SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
      SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
      SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
      SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
      SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );

      for( size_t j=jbegin; j<jend; ++j ) {
         const SIMDType x1( set( x[j] ) );  // Broadcast x[j] to all SIMD lanes
         xmm1 -= A.load(i             ,j) * x1;
         xmm2 -= A.load(i+SIMDSIZE    ,j) * x1;
         xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
         xmm4 -= A.load(i+SIMDSIZE*3UL,j) * x1;
         xmm5 -= A.load(i+SIMDSIZE*4UL,j) * x1;
         xmm6 -= A.load(i+SIMDSIZE*5UL,j) * x1;
         xmm7 -= A.load(i+SIMDSIZE*6UL,j) * x1;
         xmm8 -= A.load(i+SIMDSIZE*7UL,j) * x1;
      }

      y.store( i             , xmm1 );
      y.store( i+SIMDSIZE    , xmm2 );
      y.store( i+SIMDSIZE*2UL, xmm3 );
      y.store( i+SIMDSIZE*3UL, xmm4 );
      y.store( i+SIMDSIZE*4UL, xmm5 );
      y.store( i+SIMDSIZE*5UL, xmm6 );
      y.store( i+SIMDSIZE*6UL, xmm7 );
      y.store( i+SIMDSIZE*7UL, xmm8 );
   }

   // Panel of 4 SIMD vectors per iteration
   for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      SIMDType xmm1( y.load(i             ) );
      SIMDType xmm2( y.load(i+SIMDSIZE    ) );
      SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
      SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );

      for( size_t j=jbegin; j<jend; ++j ) {
         const SIMDType x1( set( x[j] ) );
         xmm1 -= A.load(i             ,j) * x1;
         xmm2 -= A.load(i+SIMDSIZE    ,j) * x1;
         xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
         xmm4 -= A.load(i+SIMDSIZE*3UL,j) * x1;
      }

      y.store( i             , xmm1 );
      y.store( i+SIMDSIZE    , xmm2 );
      y.store( i+SIMDSIZE*2UL, xmm3 );
      y.store( i+SIMDSIZE*3UL, xmm4 );
   }

   // Panel of 3 SIMD vectors per iteration
   for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      SIMDType xmm1( y.load(i             ) );
      SIMDType xmm2( y.load(i+SIMDSIZE    ) );
      SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );

      for( size_t j=jbegin; j<jend; ++j ) {
         const SIMDType x1( set( x[j] ) );
         xmm1 -= A.load(i             ,j) * x1;
         xmm2 -= A.load(i+SIMDSIZE    ,j) * x1;
         xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
      }

      y.store( i             , xmm1 );
      y.store( i+SIMDSIZE    , xmm2 );
      y.store( i+SIMDSIZE*2UL, xmm3 );
   }

   // Panel of 2 SIMD vectors per iteration
   for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      SIMDType xmm1( y.load(i         ) );
      SIMDType xmm2( y.load(i+SIMDSIZE) );

      for( size_t j=jbegin; j<jend; ++j ) {
         const SIMDType x1( set( x[j] ) );
         xmm1 -= A.load(i         ,j) * x1;
         xmm2 -= A.load(i+SIMDSIZE,j) * x1;
      }

      y.store( i         , xmm1 );
      y.store( i+SIMDSIZE, xmm2 );
   }

   // Single SIMD vector per iteration
   for( ; i<ipos; i+=SIMDSIZE )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )
                           ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                           :( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )
                         ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                         :( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      SIMDType xmm1( y.load(i) );

      for( size_t j=jbegin; j<jend; ++j ) {
         xmm1 -= A.load(i,j) * set( x[j] );
      }

      y.store( i, xmm1 );
   }

   // Scalar remainder loop for the unpadded tail rows; note that the row's
   // product is accumulated positively and subtracted from y[i] in one step.
   for( ; remainder && i<M; ++i )
   {
      const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
      const size_t jend( ( IsLower_v<MT1> )?( min( i+1UL, N ) ):( N ) );
      BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

      ElementType value( A(i,jbegin) * x[jbegin] );

      for( size_t j=jbegin+1UL; j<jend; ++j ) {
         value += A(i,j) * x[j];
      }

      y[i] -= value;
   }
}
1945 //**********************************************************************************************
1946
1947 //**Default subtraction assignment to dense vectors (large matrices)****************************
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
{
   // Fallback for operand combinations that cannot use the SIMD kernel:
   // delegate directly to the scalar default kernel.
   selectDefaultSubAssignKernel( y, A, x );
}
1970 //**********************************************************************************************
1971
1972 //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
1987 template< typename VT1 // Type of the left-hand side target vector
1988 , typename MT1 // Type of the left-hand side matrix operand
1989 , typename VT2 > // Type of the right-hand side vector operand
1990 static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1991 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1992 {
1993 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1994
1995 const size_t M( A.rows() );
1996 const size_t N( A.columns() );
1997
1998 const size_t iblock( 32768UL / sizeof( ElementType ) );
1999 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
2000
2001 BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
2002
2003 for( size_t ii=0U; ii<M; ii+=iblock ) {
2004 for( size_t jj=0UL; jj<N; jj+=jblock )
2005 {
2006 const size_t jend( min( jj+jblock, N ) );
2007 const size_t itmp( min( ii+iblock, M ) );
2008 const size_t iend( ( IsUpper_v<MT1> )
2009 ?( min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
2010 :( itmp ) );
2011
2012 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2013 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2014
2015 size_t i( ( IsLower_v<MT1> )
2016 ?( max( ii, prevMultiple( ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ), SIMDSIZE ) ) )
2017 :( ii ) );
2018
2019 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
2020 {
2021 SIMDType x1( set( x[jj] ) );
2022 SIMDType xmm1( A.load(i ,jj) * x1 );
2023 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
2024 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
2025 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
2026 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jj) * x1 );
2027 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jj) * x1 );
2028 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jj) * x1 );
2029 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jj) * x1 );
2030
2031 for( size_t j=jj+1UL; j<jend; ++j ) {
2032 x1 = set( x[j] );
2033 xmm1 += A.load(i ,j) * x1;
2034 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2035 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2036 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2037 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
2038 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
2039 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
2040 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
2041 }
2042
2043 y.store( i , y.load(i ) - xmm1 );
2044 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2045 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2046 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
2047 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5 );
2048 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6 );
2049 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7 );
2050 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8 );
2051 }
2052
2053 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2054 {
2055 SIMDType x1( set( x[jj] ) );
2056 SIMDType xmm1( A.load(i ,jj) * x1 );
2057 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
2058 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
2059 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
2060
2061 for( size_t j=jj+1UL; j<jend; ++j ) {
2062 x1 = set( x[j] );
2063 xmm1 += A.load(i ,j) * x1;
2064 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2065 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2066 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2067 }
2068
2069 y.store( i , y.load(i ) - xmm1 );
2070 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2071 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2072 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
2073 }
2074
2075 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2076 {
2077 SIMDType x1( set( x[jj] ) );
2078 SIMDType xmm1( A.load(i ,jj) * x1 );
2079 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
2080 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
2081
2082 for( size_t j=jj+1UL; j<jend; ++j ) {
2083 x1 = set( x[j] );
2084 xmm1 += A.load(i ,j) * x1;
2085 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2086 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2087 }
2088
2089 y.store( i , y.load(i ) - xmm1 );
2090 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2091 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2092 }
2093
2094 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2095 {
2096 SIMDType x1( set( x[jj] ) );
2097 SIMDType xmm1( A.load(i ,jj) * x1 );
2098 SIMDType xmm2( A.load(i+SIMDSIZE,jj) * x1 );
2099
2100 for( size_t j=jj+1UL; j<jend; ++j ) {
2101 x1 = set( x[j] );
2102 xmm1 += A.load(i ,j) * x1;
2103 xmm2 += A.load(i+SIMDSIZE,j) * x1;
2104 }
2105
2106 y.store( i , y.load(i ) - xmm1 );
2107 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2 );
2108 }
2109
2110 for( ; i<ipos; i+=SIMDSIZE )
2111 {
2112 SIMDType xmm1( A.load(i,jj) * set( x[jj] ) );
2113
2114 for( size_t j=jj+1UL; j<jend; ++j ) {
2115 xmm1 += A.load(i,j) * set( x[j] );
2116 }
2117
2118 y.store( i, y.load(i) - xmm1 );
2119 }
2120
2121 for( ; remainder && i<iend; ++i )
2122 {
2123 ElementType value( A(i,jj) * x[jj] );
2124
2125 for( size_t j=jj+1UL; j<jend; ++j ) {
2126 value += A(i,j) * x[j];
2127 }
2128
2129 y[i] -= value;
2130 }
2131 }
2132 }
2133 }
2135 //**********************************************************************************************
2136
2137 //**BLAS-based subtraction assignment to dense vectors (default)********************************
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
{
   // Fallback for operand combinations that cannot use the BLAS kernel:
   // delegate to the large-matrix default kernel.
   selectLargeSubAssignKernel( y, A, x );
}
2160 //**********************************************************************************************
2161
2162 //**BLAS-based subtraction assignment to dense vectors******************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
{
   using ET = ElementType_t<VT1>;

   if( IsTriangular_v<MT1> ) {
      // trmv works in place: evaluate x into a temporary, overwrite the
      // temporary with A*tmp, then subtract the result from y.
      ResultType_t<VT1> tmp( serial( x ) );
      trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      subAssign( y, tmp );
   }
   else {
      // gemv: y = ET(-1)*A*x + ET(1)*y, i.e. y -= A*x
      gemv( y, A, x, ET(-1), ET(1) );
   }
}
#endif
2196 //**********************************************************************************************
2197
2198 //**Subtraction assignment to sparse vectors****************************************************
2199 // No special implementation for the subtraction assignment to sparse vectors.
2200 //**********************************************************************************************
2201
2202 //**Multiplication assignment to dense vectors**************************************************
template< typename VT1 >  // Type of the target dense vector
friend inline void multAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
{
   // Elementwise multiplication assignment: lhs *= (A*x). The product is
   // evaluated into a temporary first, since every element of the result
   // is needed for the elementwise multiplication.

   BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

   const ResultType tmp( serial( rhs ) );
   multAssign( *lhs, tmp );
}
2230 //**********************************************************************************************
2231
2232 //**Multiplication assignment to sparse vectors*************************************************
2233 // No special implementation for the multiplication assignment to sparse vectors.
2234 //**********************************************************************************************
2235
   //**Division assignment to dense vectors********************************************************
   /*!\brief Division assignment of a transpose dense matrix-dense vector multiplication to a
   //        dense vector (\f$ \vec{y}/=A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression divisor.
   // \return void
   //
   // The product A*x is evaluated into a temporary first, since an element-wise division
   // assignment cannot be performed in-place on the matrix-vector product.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline void divAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
   {
      // (function-trace and compile-time constraint macros elided in this extraction)

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( serial( rhs ) );
      divAssign( *lhs, tmp );
   }
   //**********************************************************************************************
2265
2266 //**Division assignment to sparse vectors*******************************************************
2267 // No special implementation for the division assignment to sparse vectors.
2268 //**********************************************************************************************
2269
   //**SMP assignment to dense vectors*************************************************************
   /*!\brief SMP assignment of a transpose dense matrix-dense vector multiplication to a dense
   //        vector (\f$ \vec{y}=A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression to be assigned.
   // \return void
   //
   // Evaluates both operands up front and restarts the expression via smpAssign() so that the
   // actual computation is distributed across the available threads. Only enabled when at least
   // one operand requires an intermediate evaluation (see UseSMPAssign_v).
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline auto smpAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      // (function-trace macro elided in this extraction)

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      if( rhs.mat_.rows() == 0UL ) {
         // Zero-sized target: nothing to assign
         return;
      }
      else if( rhs.mat_.columns() == 0UL ||
               ( IsStrictlyTriangular_v<MT> && rhs.mat_.columns() == 1UL ) ) {
         // The product is structurally zero: reset the target instead of computing
         reset( *lhs );
         return;
      }

      LT A( rhs.mat_ );  // Evaluation of the left-hand side dense matrix operand
      RT x( rhs.vec_ );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size()     , "Invalid vector size"       );

      smpAssign( *lhs, A * x );
   }
   //**********************************************************************************************
2314
   //**SMP assignment to sparse vectors************************************************************
   /*!\brief SMP assignment of a transpose dense matrix-dense vector multiplication to a sparse
   //        vector (\f$ \vec{y}=A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side sparse vector.
   // \param rhs The right-hand side multiplication expression to be assigned.
   // \return void
   //
   // The dense result is computed into a temporary (in parallel, via the dense-vector SMP
   // assignment of the expression) and then SMP-assigned to the sparse target.
   */
   template< typename VT1 >  // Type of the target sparse vector
   friend inline auto smpAssign( SparseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      // (function-trace and compile-time constraint macros elided in this extraction)

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( rhs );
      smpAssign( *lhs, tmp );
   }
   //**********************************************************************************************
2347
   //**SMP addition assignment to dense vectors****************************************************
   /*!\brief SMP addition assignment of a transpose dense matrix-dense vector multiplication to
   //        a dense vector (\f$ \vec{y}+=A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression to be added.
   // \return void
   //
   // Evaluates both operands and restarts the expression via smpAddAssign(). A structurally
   // zero product leaves the target unchanged, so those cases simply return.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline auto smpAddAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      // (function-trace macro elided in this extraction)

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
         // Nothing to add: the product is empty or structurally zero
         return;
      }

      LT A( rhs.mat_ );  // Evaluation of the left-hand side dense matrix operand
      RT x( rhs.vec_ );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size()     , "Invalid vector size"       );

      smpAddAssign( *lhs, A * x );
   }
   //**********************************************************************************************
2388
2389 //**SMP addition assignment to sparse vectors***************************************************
2390 // No special implementation for the SMP addition assignment to sparse vectors.
2391 //**********************************************************************************************
2392
   //**SMP subtraction assignment to dense vectors*************************************************
   /*!\brief SMP subtraction assignment of a transpose dense matrix-dense vector multiplication
   //        to a dense vector (\f$ \vec{y}-=A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression to be subtracted.
   // \return void
   //
   // Evaluates both operands and restarts the expression via smpSubAssign(). A structurally
   // zero product leaves the target unchanged, so those cases simply return.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline auto smpSubAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      // (function-trace macro elided in this extraction)

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
         // Nothing to subtract: the product is empty or structurally zero
         return;
      }

      LT A( rhs.mat_ );  // Evaluation of the left-hand side dense matrix operand
      RT x( rhs.vec_ );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size()     , "Invalid vector size"       );

      smpSubAssign( *lhs, A * x );
   }
   //**********************************************************************************************
2433
2434 //**SMP subtraction assignment to sparse vectors************************************************
2435 // No special implementation for the SMP subtraction assignment to sparse vectors.
2436 //**********************************************************************************************
2437
   //**SMP multiplication assignment to dense vectors**********************************************
   /*!\brief SMP multiplication assignment of a transpose dense matrix-dense vector
   //        multiplication to a dense vector (\f$ \vec{y}*=A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression multiplier.
   // \return void
   //
   // The product A*x is evaluated into a temporary (in parallel) before the element-wise
   // multiplication assignment, which cannot be performed in-place.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline auto smpMultAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      // (function-trace and compile-time constraint macros elided in this extraction)

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( rhs );
      smpMultAssign( *lhs, tmp );
   }
   //**********************************************************************************************
2470
2471 //**SMP multiplication assignment to sparse vectors*********************************************
2472 // No special implementation for the SMP multiplication assignment to sparse vectors.
2473 //**********************************************************************************************
2474
   //**SMP division assignment to dense vectors****************************************************
   /*!\brief SMP division assignment of a transpose dense matrix-dense vector multiplication to
   //        a dense vector (\f$ \vec{y}/=A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression divisor.
   // \return void
   //
   // The product A*x is evaluated into a temporary (in parallel) before the element-wise
   // division assignment, which cannot be performed in-place.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline auto smpDivAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      // (function-trace and compile-time constraint macros elided in this extraction)

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      const ResultType tmp( rhs );
      smpDivAssign( *lhs, tmp );
   }
   //**********************************************************************************************
2507
2508 //**SMP division assignment to sparse vectors***************************************************
2509 // No special implementation for the SMP division assignment to sparse vectors.
2510 //**********************************************************************************************
2511
2512 //**Compile time checks*************************************************************************
2520 //**********************************************************************************************
2521};
2522//*************************************************************************************************
2523
2524
2525
2526
2527//=================================================================================================
2528//
2529// DVECSCALARMULTEXPR SPECIALIZATION
2530//
2531//=================================================================================================
2532
2533//*************************************************************************************************
2542template< typename MT // Type of the left-hand side dense matrix
2543 , typename VT // Type of the right-hand side dense vector
2544 , typename ST > // Type of the side scalar value
2545class DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >
2546 : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false > >
2547 , private Computation
2548{
2549 private:
2550 //**Type definitions****************************************************************************
2551 using MVM = TDMatDVecMultExpr<MT,VT>;
2552 using RES = ResultType_t<MVM>;
2553 using MRT = ResultType_t<MT>;
2554 using VRT = ResultType_t<VT>;
2555 using MET = ElementType_t<MRT>;
2556 using VET = ElementType_t<VRT>;
2557 using MCT = CompositeType_t<MT>;
2558 using VCT = CompositeType_t<VT>;
2559 //**********************************************************************************************
2560
2561 //**********************************************************************************************
2563 static constexpr bool evaluateMatrix =
2564 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2565 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
2566 //**********************************************************************************************
2567
2568 //**********************************************************************************************
2570 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2571 //**********************************************************************************************
2572
2573 //**********************************************************************************************
2575
2578 template< typename T1 >
2579 static constexpr bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
2580 //**********************************************************************************************
2581
2582 //**********************************************************************************************
2584
2586 template< typename T1, typename T2, typename T3, typename T4 >
2587 static constexpr bool UseBlasKernel_v =
2589 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2590 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2591 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2592 !IsDiagonal_v<T2> &&
2593 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2594 IsBLASCompatible_v< ElementType_t<T1> > &&
2595 IsBLASCompatible_v< ElementType_t<T2> > &&
2596 IsBLASCompatible_v< ElementType_t<T3> > &&
2597 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2598 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2599 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2600 //**********************************************************************************************
2601
2602 //**********************************************************************************************
2604
2607 template< typename T1, typename T2, typename T3, typename T4 >
2608 static constexpr bool UseVectorizedDefaultKernel_v =
2609 ( useOptimizedKernels &&
2610 !IsDiagonal_v<T2> &&
2611 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2612 IsSIMDCombinable_v< ElementType_t<T1>
2613 , ElementType_t<T2>
2614 , ElementType_t<T3>
2615 , T4 > &&
2616 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2617 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2618 //**********************************************************************************************
2619
2620 public:
2621 //**Type definitions****************************************************************************
2623 using This = DVecScalarMultExpr<MVM,ST,false>;
2624
2626 using BaseType = VecScalarMultExpr< DenseVector<This,false> >;
2627
2628 using ResultType = MultTrait_t<RES,ST>;
2629 using TransposeType = TransposeType_t<ResultType>;
2630 using ElementType = ElementType_t<ResultType>;
2631 using SIMDType = SIMDTrait_t<ElementType>;
2632 using ReturnType = const ElementType;
2633 using CompositeType = const ResultType;
2634
2636 using LeftOperand = const TDMatDVecMultExpr<MT,VT>;
2637
2639 using RightOperand = ST;
2640
2642 using LT = If_t< evaluateMatrix, const MRT, MCT >;
2643
2645 using RT = If_t< evaluateVector, const VRT, VCT >;
2646 //**********************************************************************************************
2647
2648 //**Compilation flags***************************************************************************
2650 static constexpr bool simdEnabled =
2651 ( !IsDiagonal_v<MT> &&
2652 MT::simdEnabled && VT::simdEnabled &&
2653 IsSIMDCombinable_v<MET,VET,ST> &&
2654 HasSIMDAdd_v<MET,VET> &&
2655 HasSIMDMult_v<MET,VET> );
2656
2658 static constexpr bool smpAssignable =
2659 ( !evaluateMatrix && MT::smpAssignable && !evaluateVector && VT::smpAssignable );
2660 //**********************************************************************************************
2661
2662 //**SIMD properties*****************************************************************************
2664 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
2665 //**********************************************************************************************
2666
   //**Constructor*********************************************************************************
   /*!\brief Constructor for the DVecScalarMultExpr specialization.
   //
   // \param vector The left-hand side matrix-vector multiplication expression.
   // \param scalar The right-hand side scalar of the multiplication expression.
   */
   inline DVecScalarMultExpr( const MVM& vector, ST scalar )
      : vector_( vector )  // Left-hand side dense vector of the multiplication expression
      , scalar_( scalar )  // Right-hand side scalar of the multiplication expression
   {}
   //**********************************************************************************************
2678
   //**Subscript operator**************************************************************************
   /*!\brief Subscript operator for the direct access to the vector elements.
   //
   // \param index Access index. The index has to be smaller than the current size of the vector.
   // \return The matrix-vector product element at the given index, scaled by the scalar.
   */
   inline ReturnType operator[]( size_t index ) const {
      BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
      return vector_[index] * scalar_;
   }
   //**********************************************************************************************
2690
2691 //**At function*********************************************************************************
2698 inline ReturnType at( size_t index ) const {
2699 if( index >= vector_.size() ) {
2700 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2701 }
2702 return (*this)[index];
2703 }
2704 //**********************************************************************************************
2705
   //**Size function*******************************************************************************
   /*!\brief Returns the current size/dimension of the vector.
   //
   // \return The size of the vector (identical to the size of the underlying product expression).
   */
   inline size_t size() const {
      return vector_.size();
   }
   //**********************************************************************************************
2715
   //**Left operand access*************************************************************************
   /*!\brief Returns the left-hand side matrix-vector multiplication expression.
   //
   // \return The left-hand side dense vector operand of the scaling expression.
   */
   inline LeftOperand leftOperand() const {
      return vector_;
   }
   //**********************************************************************************************
2725
   //**Right operand access************************************************************************
   /*!\brief Returns the right-hand side scalar operand.
   //
   // \return The scalar of the multiplication expression.
   */
   inline RightOperand rightOperand() const {
      return scalar_;
   }
   //**********************************************************************************************
2735
   //**********************************************************************************************
   /*!\brief Returns whether the expression can alias with the given address \a alias.
   //
   // \param alias The alias to be checked.
   // \return \a true in case an alias effect is possible, \a false if not.
   //
   // Forwards the query to the underlying matrix-vector multiplication expression.
   */
   template< typename T >
   inline bool canAlias( const T* alias ) const {
      return vector_.canAlias( alias );
   }
   //**********************************************************************************************
2747
   //**********************************************************************************************
   /*!\brief Returns whether the expression is aliased with the given address \a alias.
   //
   // \param alias The alias to be checked.
   // \return \a true in case the given alias is contained in this expression, \a false if not.
   //
   // Forwards the query to the underlying matrix-vector multiplication expression.
   */
   template< typename T >
   inline bool isAliased( const T* alias ) const {
      return vector_.isAliased( alias );
   }
   //**********************************************************************************************
2759
   //**********************************************************************************************
   /*!\brief Returns whether the operands of the expression are properly aligned in memory.
   //
   // \return \a true in case the operands are aligned, \a false if not.
   */
   inline bool isAligned() const {
      return vector_.isAligned();
   }
   //**********************************************************************************************
2769
   //**********************************************************************************************
   /*!\brief Returns whether the expression can be used in SMP assignments.
   //
   // \return \a true in case the expression can be used in SMP assignments, \a false if not.
   //
   // SMP assignment pays off only for sufficiently large targets, and is avoided when the
   // (non-parallel) BLAS kernel would be selected for the underlying product.
   // NOTE(review): the doxygen extraction dropped two source lines inside the return
   // expression (original lines 2778-2779) — presumably an additional
   // !BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION disjunct; confirm against the original file.
   */
   inline bool canSMPAssign() const noexcept {
      LeftOperand_t<MVM> A( vector_.leftOperand() );
      return ( !BLAZE_BLAS_MODE ||
               ( IsComputation_v<MT> && !evaluateMatrix ) ||
               ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
             ( size() > SMP_TDMATDVECMULT_THRESHOLD );
   }
   //**********************************************************************************************
2785
2786 private:
2787 //**Member variables****************************************************************************
2790 //**********************************************************************************************
2791
   //**Assignment to dense vectors*****************************************************************
   /*!\brief Assignment of a scaled transpose dense matrix-dense vector multiplication to a
   //        dense vector (\f$ \vec{y}=s*A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be assigned.
   // \return void
   //
   // Evaluates both operands of the inner matrix-vector product serially and dispatches to
   // the appropriate assignment kernel, passing the scalar through to the kernel.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   {
      // (function-trace macro elided in this extraction)

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      LeftOperand_t<MVM>  left ( rhs.vector_.leftOperand()  );
      RightOperand_t<MVM> right( rhs.vector_.rightOperand() );

      if( left.rows() == 0UL ) {
         // Zero-sized target: nothing to assign
         return;
      }
      else if( left.columns() == 0UL ||
               ( IsStrictlyTriangular_v<MT> && left.columns() == 1UL ) ) {
         // The product is structurally zero: reset the target instead of computing
         reset( *lhs );
         return;
      }

      LT A( serial( left  ) );  // Evaluation of the left-hand side dense matrix operand
      RT x( serial( right ) );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == left.rows()    , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == right.size()   , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size()  , "Invalid vector size"       );

      DVecScalarMultExpr::selectAssignKernel( *lhs, A, x, rhs.scalar_ );
   }
   //**********************************************************************************************
2834
2835 //**Assignment to dense vectors (kernel selection)**********************************************
2846 template< typename VT1 // Type of the left-hand side target vector
2847 , typename MT1 // Type of the left-hand side matrix operand
2848 , typename VT2 // Type of the right-hand side vector operand
2849 , typename ST2 > // Type of the scalar value
2850 static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
2851 {
2852 if( ( IsDiagonal_v<MT1> ) ||
2853 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2854 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
2855 selectSmallAssignKernel( y, A, x, scalar );
2856 else
2857 selectBlasAssignKernel( y, A, x, scalar );
2858 }
2859 //**********************************************************************************************
2860
   //**Default assignment to dense vectors*********************************************************
   /*!\brief Default assignment of a scaled transpose dense matrix-dense vector multiplication
   //        (\f$ \vec{y}=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Column-major traversal: for each column j, A(:,j)*x[j] is accumulated into y. Lower/upper/
   // diagonal matrix adaptors restrict the accumulated row range; the scalar is applied in a
   // final pass (except for diagonal matrices, where it is applied directly).
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      // For strictly lower matrices A(0,0) is zero, so y[0] never receives a contribution
      if( IsStrictlyLower_v<MT1> ) {
         reset( y[0] );
      }

      // Initialize y with the first column's contribution (skipped for upper matrices,
      // whose first column contributes only to y[0], handled in the main loop)
      if( !IsUpper_v<MT1> )
      {
         for( size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<M; ++i ) {
            y[i] = A(i,0UL) * x[0UL];
         }
      }

      // Accumulate the remaining columns
      for( size_t j=( IsUpper_v<MT1> && !IsStrictlyUpper_v<MT1> ? 0UL : 1UL ); j<N; ++j )
      {
         if( IsDiagonal_v<MT1> )
         {
            // Diagonal matrix: single contribution per element; scalar applied directly
            y[j] = A(j,j) * x[j] * scalar;
         }
         else
         {
            // Row range with possibly non-zero entries in column j
            const size_t ibegin( ( IsLower_v<MT1> )
                                 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
                                 :( 0UL ) );
            const size_t iend( ( IsUpper_v<MT1> )
                               ?( IsStrictlyUpper_v<MT1> ? j-1UL : j )
                               :( M ) );
            BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );

            // Unroll the accumulation loop by two
            const size_t inum( iend - ibegin );
            const size_t ipos( ibegin + prevMultiple( inum, 2UL ) );
            BLAZE_INTERNAL_ASSERT( ipos <= ibegin+inum, "Invalid end calculation" );

            for( size_t i=ibegin; i<ipos; i+=2UL ) {
               y[i    ] += A(i    ,j) * x[j];
               y[i+1UL] += A(i+1UL,j) * x[j];
            }
            if( ipos < iend ) {
               y[ipos] += A(ipos,j) * x[j];
            }
            // For upper matrices, row 'iend' sees its first contribution in column j:
            // plain assignment initializes it
            if( IsUpper_v<MT1> ) {
               y[iend] = A(iend,j) * x[j];
            }
         }
      }

      // For strictly upper matrices the last element never receives a contribution
      if( IsStrictlyUpper_v<MT1> ) {
         reset( y[M-1UL] );
      }

      // Final pass: apply the scalar to all elements written by the accumulation above
      if( !IsDiagonal_v<MT1> )
      {
         const size_t iend( IsStrictlyUpper_v<MT1> ? M-1UL : M );
         for( size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<iend; ++i ) {
            y[i] *= scalar;
         }
      }
   }
   //**********************************************************************************************
2941
   //**Default assignment to dense vectors (small matrices)****************************************
   /*!\brief Default assignment of a small scaled transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y}=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Fallback for operand types that do not support the vectorized kernel: forwards to the
   // scalar default kernel.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
   {
      selectDefaultAssignKernel( y, A, x, scalar );
   }
   //**********************************************************************************************
2966
   //**Vectorized default assignment to dense vectors (small matrices)*****************************
   /*!\brief Vectorized default assignment of a small scaled transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y}=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // SIMD kernel: processes the rows of y in register blocks of 8/4/3/2/1 SIMD vectors. Since
   // A is column-major, each block loads contiguous column fragments A(i..i+k*SIMDSIZE-1, j)
   // and accumulates them scaled by the broadcast element x[j]. Lower/upper adaptors restrict
   // the column range [jbegin,jend) per row block; the scalar is folded into the final stores.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
   {
      // Unpadded operands require a scalar remainder loop for the trailing rows
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
      BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );

      // Broadcast the scalar once for the final stores
      const SIMDType factor( set( scalar ) );

      size_t i( 0UL );

      // Tier 1: blocks of 8 SIMD vectors (8*SIMDSIZE rows at a time)
      for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         // Initialize the accumulators with the first column's contribution
         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );
         SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jbegin) * x1 );
         SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jbegin) * x1 );
         SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jbegin) * x1 );
         SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jbegin) * x1 );
         SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
            xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
            xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
            xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
            xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
            xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
         }

         y.store( i             , xmm1*factor );
         y.store( i+SIMDSIZE    , xmm2*factor );
         y.store( i+SIMDSIZE*2UL, xmm3*factor );
         y.store( i+SIMDSIZE*3UL, xmm4*factor );
         y.store( i+SIMDSIZE*4UL, xmm5*factor );
         y.store( i+SIMDSIZE*5UL, xmm6*factor );
         y.store( i+SIMDSIZE*6UL, xmm7*factor );
         y.store( i+SIMDSIZE*7UL, xmm8*factor );
      }

      // Tier 2: blocks of 4 SIMD vectors
      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );
         SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
            xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
         }

         y.store( i             , xmm1*factor );
         y.store( i+SIMDSIZE    , xmm2*factor );
         y.store( i+SIMDSIZE*2UL, xmm3*factor );
         y.store( i+SIMDSIZE*3UL, xmm4*factor );
      }

      // Tier 3: blocks of 3 SIMD vectors
      for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
         }

         y.store( i             , xmm1*factor );
         y.store( i+SIMDSIZE    , xmm2*factor );
         y.store( i+SIMDSIZE*2UL, xmm3*factor );
      }

      // Tier 4: blocks of 2 SIMD vectors
      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i         ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i         ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE,j) * x1;
         }

         y.store( i         , xmm1*factor );
         y.store( i+SIMDSIZE, xmm2*factor );
      }

      // Tier 5: single SIMD vectors
      for( ; i<ipos; i+=SIMDSIZE )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType xmm1( A.load(i,jbegin) * set( x[jbegin] ) );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            xmm1 += A.load(i,j) * set( x[j] );
         }

         y.store( i, xmm1*factor );
      }

      // Scalar remainder loop for the trailing (unpadded) rows
      for( ; remainder && i<M; ++i )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )?( min( i+1UL, N ) ):( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         ElementType value( A(i,jbegin) * x[jbegin] );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            value += A(i,j) * x[j];
         }

         y[i] = value * scalar;
      }
   }
   //**********************************************************************************************
3158
   //**Default assignment to dense vectors (large matrices)****************************************
   /*!\brief Default assignment of a large scaled transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y}=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Fallback for operand types that do not support the vectorized kernel: forwards to the
   // scalar default kernel.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
   {
      selectDefaultAssignKernel( y, A, x, scalar );
   }
   //**********************************************************************************************
3183
3184 //**Vectorized default assignment to dense vectors (large matrices)*****************************
3198 template< typename VT1 // Type of the left-hand side target vector
3199 , typename MT1 // Type of the left-hand side matrix operand
3200 , typename VT2 // Type of the right-hand side vector operand
3201 , typename ST2 > // Type of the scalar value
3202 static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3203 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3204 {
3205 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
3206
3207 const size_t M( A.rows() );
3208 const size_t N( A.columns() );
3209
3210 const size_t iblock( 32768UL / sizeof( ElementType ) );
3211 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3212
3213 BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3214
3215 const SIMDType factor( set( scalar ) );
3216
3217 reset( y );
3218
3219 for( size_t ii=0U; ii<M; ii+=iblock ) {
3220 for( size_t jj=0UL; jj<N; jj+=jblock )
3221 {
3222 const size_t jend( min( jj+jblock, N ) );
3223 const size_t itmp( min( ii+iblock, M ) );
3224 const size_t iend( ( IsUpper_v<MT1> )
3225 ?( min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
3226 :( itmp ) );
3227
3228 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3229 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3230
3231 size_t i( ( IsLower_v<MT1> )
3232 ?( max( ii, prevMultiple( ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ), SIMDSIZE ) ) )
3233 :( ii ) );
3234
3235 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3236 {
3237 SIMDType x1( set( x[jj] ) );
3238 SIMDType xmm1( A.load(i ,jj) * x1 );
3239 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
3240 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
3241 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
3242 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jj) * x1 );
3243 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jj) * x1 );
3244 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jj) * x1 );
3245 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jj) * x1 );
3246
3247 for( size_t j=jj+1UL; j<jend; ++j ) {
3248 x1 = set( x[j] );
3249 xmm1 += A.load(i ,j) * x1;
3250 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3251 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3252 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3253 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3254 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3255 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3256 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3257 }
3258
3259 y.store( i , y.load(i ) + xmm1*factor );
3260 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3261 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3262 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3263 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3264 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3265 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3266 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3267 }
3268
3269 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3270 {
3271 SIMDType x1( set( x[jj] ) );
3272 SIMDType xmm1( A.load(i ,jj) * x1 );
3273 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
3274 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
3275 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
3276
3277 for( size_t j=jj+1UL; j<jend; ++j ) {
3278 x1 = set( x[j] );
3279 xmm1 += A.load(i ,j) * x1;
3280 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3281 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3282 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3283 }
3284
3285 y.store( i , y.load(i ) + xmm1*factor );
3286 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3287 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3288 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3289 }
3290
3291 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3292 {
3293 SIMDType x1( set( x[jj] ) );
3294 SIMDType xmm1( A.load(i ,jj) * x1 );
3295 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
3296 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
3297
3298 for( size_t j=jj+1UL; j<jend; ++j ) {
3299 x1 = set( x[j] );
3300 xmm1 += A.load(i ,j) * x1;
3301 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3302 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3303 }
3304
3305 y.store( i , y.load(i ) + xmm1*factor );
3306 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3307 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3308 }
3309
3310 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3311 {
3312 SIMDType x1( set( x[jj] ) );
3313 SIMDType xmm1( A.load(i ,jj) * x1 );
3314 SIMDType xmm2( A.load(i+SIMDSIZE,jj) * x1 );
3315
3316 for( size_t j=jj+1UL; j<jend; ++j ) {
3317 x1 = set( x[j] );
3318 xmm1 += A.load(i ,j) * x1;
3319 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3320 }
3321
3322 y.store( i , y.load(i ) + xmm1*factor );
3323 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3324 }
3325
3326 for( ; i<ipos; i+=SIMDSIZE )
3327 {
3328 SIMDType xmm1( A.load(i,jj) * set( x[jj] ) );
3329
3330 for( size_t j=jj+1UL; j<jend; ++j ) {
3331 xmm1 += A.load(i,j) * set( x[j] );
3332 }
3333
3334 y.store( i, y.load(i) + xmm1*factor );
3335 }
3336
3337 for( ; remainder && i<iend; ++i )
3338 {
3339 ElementType value( A(i,jj) * x[jj] );
3340
3341 for( size_t j=jj+1UL; j<jend; ++j ) {
3342 value += A(i,j) * x[j];
3343 }
3344
3345 y[i] += value * scalar;
3346 }
3347 }
3348 }
3349 }
3350 //**********************************************************************************************
3351
3352 //**BLAS-based assignment to dense vectors (default)********************************************
3366 template< typename VT1 // Type of the left-hand side target vector
3367 , typename MT1 // Type of the left-hand side matrix operand
3368 , typename VT2 // Type of the right-hand side vector operand
3369 , typename ST2 > // Type of the scalar value
3370 static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3371 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3372 {
3373 selectLargeAssignKernel( y, A, x, scalar );
3374 }
3375 //**********************************************************************************************
3376
3377 //**BLAS-based assignment to dense vectors******************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   //! \brief BLAS-based assignment of a scaled transpose dense matrix-dense vector
   //         multiplication (\f$ \vec{y}=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   template< typename TVec     // Type of the left-hand side target vector
           , typename TMat     // Type of the left-hand side matrix operand
           , typename SVec     // Type of the right-hand side vector operand
           , typename TScal >  // Type of the scalar value
   static inline auto selectBlasAssignKernel( TVec& y, const TMat& A, const SVec& x, TScal scalar )
      -> EnableIf_t< UseBlasKernel_v<TVec,TMat,SVec,TScal> >
   {
      using ET = ElementType_t<TVec>;

      if( !IsTriangular_v<TMat> ) {
         // General case: y = scalar*A*x in a single GEMV call (beta == 0 discards old y)
         gemv( y, A, x, ET(scalar), ET(0) );
      }
      else {
         // Triangular case: seed y with scalar*x, then multiply in place via TRMV
         assign( y, scalar * x );
         trmv( y, A, IsLower_v<TMat> ? CblasLower : CblasUpper );
      }
   }
#endif
3410 //**********************************************************************************************
3411
3412 //**Assignment to sparse vectors****************************************************************
   //! \brief Assignment of a scaled transpose dense matrix-dense vector multiplication to a
   //         sparse vector (\f$ \vec{y}=s*A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side sparse vector.
   // \param rhs The right-hand side scaled multiplication expression to be assigned.
   // \return void
   //
   // The expression is evaluated into a dense temporary first, since a sparse target cannot
   // be filled efficiently element by element during the multiplication.
   template< typename VT1 >  // Type of the target sparse vector
   friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   {


      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // Serial evaluation of the complete expression into a dense temporary, then assignment
      const ResultType tmp( serial( rhs ) );
      assign( *lhs, tmp );
   }
3438 //**********************************************************************************************
3439
3440 //**Addition assignment to dense vectors********************************************************
   //! \brief Addition assignment of a scaled transpose dense matrix-dense vector multiplication
   //         to a dense vector (\f$ \vec{y}+=s*A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be added.
   // \return void
   template< typename VT1 >  // Type of the target dense vector
   friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      LeftOperand_t<MVM>  left ( rhs.vector_.leftOperand()  );
      RightOperand_t<MVM> right( rhs.vector_.rightOperand() );

      // Nothing to add for an empty matrix or a 1x1 strictly triangular matrix (all zero)
      if( left.rows() == 0UL || left.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
         return;
      }

      LT A( serial( left )  );  // Evaluation of the left-hand side dense matrix operand
      RT x( serial( right ) );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == left.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == right.size()  , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size() , "Invalid vector size"       );

      // Dispatch to the best-fitting kernel (small/default vs. BLAS) for the evaluated operands
      DVecScalarMultExpr::selectAddAssignKernel( *lhs, A, x, rhs.scalar_ );
   }
3477 //**********************************************************************************************
3478
3479 //**Addition assignment to dense vectors (kernel selection)*************************************
3490 template< typename VT1 // Type of the left-hand side target vector
3491 , typename MT1 // Type of the left-hand side matrix operand
3492 , typename VT2 // Type of the right-hand side vector operand
3493 , typename ST2 > // Type of the scalar value
3494 static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3495 {
3496 if( ( IsDiagonal_v<MT1> ) ||
3497 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3498 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3499 selectSmallAddAssignKernel( y, A, x, scalar );
3500 else
3501 selectBlasAddAssignKernel( y, A, x, scalar );
3502 }
3503 //**********************************************************************************************
3504
3505 //**Default addition assignment to dense vectors************************************************
3519 template< typename VT1 // Type of the left-hand side target vector
3520 , typename MT1 // Type of the left-hand side matrix operand
3521 , typename VT2 // Type of the right-hand side vector operand
3522 , typename ST2 > // Type of the scalar value
3523 static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3524 {
3525 y.addAssign( A * x * scalar );
3526 }
3527 //**********************************************************************************************
3528
3529 //**Default addition assignment to dense vectors (small matrices)*******************************
3543 template< typename VT1 // Type of the left-hand side target vector
3544 , typename MT1 // Type of the left-hand side matrix operand
3545 , typename VT2 // Type of the right-hand side vector operand
3546 , typename ST2 > // Type of the scalar value
3547 static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3548 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3549 {
3550 selectDefaultAddAssignKernel( y, A, x, scalar );
3551 }
3552 //**********************************************************************************************
3553
3554 //**Vectorized default addition assignment to dense vectors (small matrices)********************
   //! \brief Vectorized default addition assignment of a scaled transpose dense matrix-dense
   //         vector multiplication to a dense vector (small matrices, \f$ \vec{y}+=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Processes the rows of the column-major matrix in SIMD panels of width 8/4/3/2/1 vectors
   // and accumulates the complete dot product over the columns before scaling once by
   // \a scalar and adding the result onto \a y. For (strictly) triangular matrices the column
   // interval [jbegin,jend) is restricted to the structurally non-zero part of the panel.
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
   {
      // A scalar tail loop is only required if one of the operands is not SIMD-padded
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
      BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );

      const SIMDType factor( set( scalar ) );  // scalar broadcast once, applied at each store

      size_t i( 0UL );

      // Panel of 8 SIMD vectors of rows per iteration
      for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );
         SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jbegin) * x1 );
         SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jbegin) * x1 );
         SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jbegin) * x1 );
         SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jbegin) * x1 );
         SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
            xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
            xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
            xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
            xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
            xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
         }

         y.store( i             , y.load(i             ) + xmm1*factor );
         y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) + xmm2*factor );
         y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
         y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
         y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
         y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
         y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
         y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
      }

      // Panel of 4 SIMD vectors of rows
      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );
         SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
            xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
         }

         y.store( i             , y.load(i             ) + xmm1*factor );
         y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) + xmm2*factor );
         y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
         y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
      }

      // Panel of 3 SIMD vectors of rows
      for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
         }

         y.store( i             , y.load(i             ) + xmm1*factor );
         y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) + xmm2*factor );
         y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
      }

      // Panel of 2 SIMD vectors of rows
      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i         ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i         ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE,j) * x1;
         }

         y.store( i         , y.load(i         ) + xmm1*factor );
         y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
      }

      // Single SIMD vector of rows
      for( ; i<ipos; i+=SIMDSIZE )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType xmm1( A.load(i,jbegin) * set( x[jbegin] ) );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            xmm1 += A.load(i,j) * set( x[j] );
         }

         y.store( i, y.load(i) + xmm1*factor );
      }

      // Scalar tail loop for the remaining unpadded rows
      for( ; remainder && i<M; ++i )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )?( min( i+1UL, N ) ):( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         ElementType value( A(i,jbegin) * x[jbegin] );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            value += A(i,j) * x[j];
         }

         y[i] += value * scalar;
      }
   }
3745 //**********************************************************************************************
3746
3747 //**Default addition assignment to dense vectors (large matrices)*******************************
3761 template< typename VT1 // Type of the left-hand side target vector
3762 , typename MT1 // Type of the left-hand side matrix operand
3763 , typename VT2 // Type of the right-hand side vector operand
3764 , typename ST2 > // Type of the scalar value
3765 static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3766 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3767 {
3768 selectDefaultAddAssignKernel( y, A, x, scalar );
3769 }
3770 //**********************************************************************************************
3771
3772 //**Vectorized default addition assignment to dense vectors (large matrices)********************
3787 template< typename VT1 // Type of the left-hand side target vector
3788 , typename MT1 // Type of the left-hand side matrix operand
3789 , typename VT2 // Type of the right-hand side vector operand
3790 , typename ST2 > // Type of the scalar value
3791 static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3792 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3793 {
3794 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
3795
3796 const size_t M( A.rows() );
3797 const size_t N( A.columns() );
3798
3799 const size_t iblock( 32768UL / sizeof( ElementType ) );
3800 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3801
3802 BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3803
3804 const SIMDType factor( set( scalar ) );
3805
3806 for( size_t ii=0U; ii<M; ii+=iblock ) {
3807 for( size_t jj=0UL; jj<N; jj+=jblock )
3808 {
3809 const size_t jend( min( jj+jblock, N ) );
3810 const size_t itmp( min( ii+iblock, M ) );
3811 const size_t iend( ( IsUpper_v<MT1> )
3812 ?( min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
3813 :( itmp ) );
3814
3815 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3816 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3817
3818 size_t i( ( IsLower_v<MT1> )
3819 ?( max( ii, prevMultiple( ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ), SIMDSIZE ) ) )
3820 :( ii ) );
3821
3822 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3823 {
3824 SIMDType x1( set( x[jj] ) );
3825 SIMDType xmm1( A.load(i ,jj) * x1 );
3826 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
3827 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
3828 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
3829 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jj) * x1 );
3830 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jj) * x1 );
3831 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jj) * x1 );
3832 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jj) * x1 );
3833
3834 for( size_t j=jj+1UL; j<jend; ++j ) {
3835 x1 = set( x[j] );
3836 xmm1 += A.load(i ,j) * x1;
3837 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3838 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3839 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3840 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3841 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3842 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3843 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3844 }
3845
3846 y.store( i , y.load(i ) + xmm1*factor );
3847 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3848 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3849 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3850 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3851 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3852 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3853 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3854 }
3855
3856 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3857 {
3858 SIMDType x1( set( x[jj] ) );
3859 SIMDType xmm1( A.load(i ,jj) * x1 );
3860 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
3861 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
3862 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
3863
3864 for( size_t j=jj+1UL; j<jend; ++j ) {
3865 x1 = set( x[j] );
3866 xmm1 += A.load(i ,j) * x1;
3867 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3868 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3869 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3870 }
3871
3872 y.store( i , y.load(i ) + xmm1*factor );
3873 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3874 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3875 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3876 }
3877
3878 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3879 {
3880 SIMDType x1( set( x[jj] ) );
3881 SIMDType xmm1( A.load(i ,jj) * x1 );
3882 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
3883 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
3884
3885 for( size_t j=jj+1UL; j<jend; ++j ) {
3886 x1 = set( x[j] );
3887 xmm1 += A.load(i ,j) * x1;
3888 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3889 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3890 }
3891
3892 y.store( i , y.load(i ) + xmm1*factor );
3893 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3894 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3895 }
3896
3897 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3898 {
3899 SIMDType x1( set( x[jj] ) );
3900 SIMDType xmm1( A.load(i ,jj) * x1 );
3901 SIMDType xmm2( A.load(i+SIMDSIZE,jj) * x1 );
3902
3903 for( size_t j=jj+1UL; j<jend; ++j ) {
3904 x1 = set( x[j] );
3905 xmm1 += A.load(i ,j) * x1;
3906 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3907 }
3908
3909 y.store( i , y.load(i ) + xmm1*factor );
3910 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3911 }
3912
3913 for( ; i<ipos; i+=SIMDSIZE )
3914 {
3915 SIMDType xmm1( A.load(i,jj) * set( x[jj] ) );
3916
3917 for( size_t j=jj+1UL; j<jend; ++j ) {
3918 xmm1 += A.load(i,j) * set( x[j] );
3919 }
3920
3921 y.store( i, y.load(i) + xmm1*factor );
3922 }
3923
3924 for( ; remainder && i<iend; ++i )
3925 {
3926 ElementType value( A(i,jj) * x[jj] );
3927
3928 for( size_t j=jj+1UL; j<jend; ++j ) {
3929 value += A(i,j) * x[j];
3930 }
3931
3932 y[i] += value * scalar;
3933 }
3934 }
3935 }
3936 }
3937 //**********************************************************************************************
3938
3939 //**BLAS-based addition assignment to dense vectors (default)***********************************
3953 template< typename VT1 // Type of the left-hand side target vector
3954 , typename MT1 // Type of the left-hand side matrix operand
3955 , typename VT2 // Type of the right-hand side vector operand
3956 , typename ST2 > // Type of the scalar value
3957 static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3958 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3959 {
3960 selectLargeAddAssignKernel( y, A, x, scalar );
3961 }
3962 //**********************************************************************************************
3963
3964 //**BLAS-based addition assignment to dense vectors*********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   //! \brief BLAS-based addition assignment of a scaled transpose dense matrix-dense vector
   //         multiplication (\f$ \vec{y}+=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   template< typename TVec     // Type of the left-hand side target vector
           , typename TMat     // Type of the left-hand side matrix operand
           , typename SVec     // Type of the right-hand side vector operand
           , typename TScal >  // Type of the scalar value
   static inline auto selectBlasAddAssignKernel( TVec& y, const TMat& A, const SVec& x, TScal scalar )
      -> EnableIf_t< UseBlasKernel_v<TVec,TMat,SVec,TScal> >
   {
      using ET = ElementType_t<TVec>;

      if( !IsTriangular_v<TMat> ) {
         // General case: accumulate via GEMV (beta == 1 keeps the existing contents of y)
         gemv( y, A, x, ET(scalar), ET(1) );
      }
      else {
         // Triangular case: compute scalar*A*x in a temporary via TRMV, then add it onto y
         ResultType_t<TVec> scaled( serial( scalar * x ) );
         trmv( scaled, A, IsLower_v<TMat> ? CblasLower : CblasUpper );
         addAssign( y, scaled );
      }
   }
#endif
3998 //**********************************************************************************************
3999
4000 //**Addition assignment to sparse vectors*******************************************************
4001 // No special implementation for the addition assignment to sparse vectors.
4002 //**********************************************************************************************
4003
4004 //**Subtraction assignment to dense vectors*****************************************************
   //! \brief Subtraction assignment of a scaled transpose dense matrix-dense vector
   //         multiplication to a dense vector (\f$ \vec{y}-=s*A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be subtracted.
   // \return void
   template< typename VT1 >  // Type of the target dense vector
   friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      LeftOperand_t<MVM>  left ( rhs.vector_.leftOperand()  );
      RightOperand_t<MVM> right( rhs.vector_.rightOperand() );

      // Nothing to subtract for an empty matrix or a 1x1 strictly triangular matrix (all zero)
      if( left.rows() == 0UL || left.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
         return;
      }

      LT A( serial( left )  );  // Evaluation of the left-hand side dense matrix operand
      RT x( serial( right ) );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == left.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == right.size()  , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size() , "Invalid vector size"       );

      // Dispatch to the best-fitting kernel (small/default vs. BLAS) for the evaluated operands
      DVecScalarMultExpr::selectSubAssignKernel( *lhs, A, x, rhs.scalar_ );
   }
4041 //**********************************************************************************************
4042
4043 //**Subtraction assignment to dense vectors (kernel selection)**********************************
4054 template< typename VT1 // Type of the left-hand side target vector
4055 , typename MT1 // Type of the left-hand side matrix operand
4056 , typename VT2 // Type of the right-hand side vector operand
4057 , typename ST2 > // Type of the scalar value
4058 static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4059 {
4060 if( ( IsDiagonal_v<MT1> ) ||
4061 ( IsComputation_v<MT> && !evaluateMatrix ) ||
4062 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
4063 selectSmallSubAssignKernel( y, A, x, scalar );
4064 else
4065 selectBlasSubAssignKernel( y, A, x, scalar );
4066 }
4067 //**********************************************************************************************
4068
4069 //**Default subtraction assignment to dense vectors*********************************************
4083 template< typename VT1 // Type of the left-hand side target vector
4084 , typename MT1 // Type of the left-hand side matrix operand
4085 , typename VT2 // Type of the right-hand side vector operand
4086 , typename ST2 > // Type of the scalar value
4087 static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4088 {
4089 y.subAssign( A * x * scalar );
4090 }
4091 //**********************************************************************************************
4092
4093 //**Default subtraction assignment to dense vectors (small matrices)****************************
4107 template< typename VT1 // Type of the left-hand side target vector
4108 , typename MT1 // Type of the left-hand side matrix operand
4109 , typename VT2 // Type of the right-hand side vector operand
4110 , typename ST2 > // Type of the scalar value
4111 static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4112 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4113 {
4114 selectDefaultSubAssignKernel( y, A, x, scalar );
4115 }
4116 //**********************************************************************************************
4117
4118 //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
   //! \brief Vectorized default subtraction assignment of a scaled transpose dense matrix-dense
   //         vector multiplication to a dense vector (small matrices, \f$ \vec{y}-=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Mirrors the vectorized addition assignment kernel: rows are traversed in SIMD panels of
   // width 8/4/3/2/1 vectors, the dot product is accumulated over the columns, scaled once by
   // \a scalar, and subtracted from \a y. For (strictly) triangular matrices the column
   // interval [jbegin,jend) is restricted to the structurally non-zero part of the panel.
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
   {
      // A scalar tail loop is only required if one of the operands is not SIMD-padded
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t ipos( remainder ? prevMultiple( M, SIMDSIZE ) : M );
      BLAZE_INTERNAL_ASSERT( ipos <= M, "Invalid end calculation" );

      const SIMDType factor( set( scalar ) );  // scalar broadcast once, applied at each store

      size_t i( 0UL );

      // Panel of 8 SIMD vectors of rows per iteration
      for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );
         SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jbegin) * x1 );
         SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jbegin) * x1 );
         SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jbegin) * x1 );
         SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jbegin) * x1 );
         SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
            xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
            xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
            xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
            xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
            xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
         }

         y.store( i             , y.load(i             ) - xmm1*factor );
         y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) - xmm2*factor );
         y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
         y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
         y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
         y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
         y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
         y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
      }

      // Panel of 4 SIMD vectors of rows
      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );
         SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
            xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
         }

         y.store( i             , y.load(i             ) - xmm1*factor );
         y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) - xmm2*factor );
         y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
         y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
      }

      // Panel of 3 SIMD vectors of rows
      for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i             ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE    ,jbegin) * x1 );
         SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i             ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE    ,j) * x1;
            xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
         }

         y.store( i             , y.load(i             ) - xmm1*factor );
         y.store( i+SIMDSIZE    , y.load(i+SIMDSIZE    ) - xmm2*factor );
         y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
      }

      // Panel of 2 SIMD vectors of rows
      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType x1( set( x[jbegin] ) );
         SIMDType xmm1( A.load(i         ,jbegin) * x1 );
         SIMDType xmm2( A.load(i+SIMDSIZE,jbegin) * x1 );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            x1 = set( x[j] );
            xmm1 += A.load(i         ,j) * x1;
            xmm2 += A.load(i+SIMDSIZE,j) * x1;
         }

         y.store( i         , y.load(i         ) - xmm1*factor );
         y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
      }

      // Single SIMD vector of rows
      for( ; i<ipos; i+=SIMDSIZE )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         SIMDType xmm1( A.load(i,jbegin) * set( x[jbegin] ) );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            xmm1 += A.load(i,j) * set( x[j] );
         }

         y.store( i, y.load(i) - xmm1*factor );
      }

      // Scalar tail loop for the remaining unpadded rows
      for( ; remainder && i<M; ++i )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )?( min( i+1UL, N ) ):( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         ElementType value( A(i,jbegin) * x[jbegin] );

         for( size_t j=jbegin+1UL; j<jend; ++j ) {
            value += A(i,j) * x[j];
         }

         y[i] -= value * scalar;
      }
   }
4309 //**********************************************************************************************
4310
4311 //**Default subtraction assignment to dense vectors (large matrices)****************************
   /*!\brief Default subtraction assignment of a scaled transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y}-=s*A*\vec{x} \f$) for large matrices.
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Fallback overload, selected when no vectorized default kernel is available for the given
   // operand types: the computation is simply forwarded to the default kernel.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
   {
      selectDefaultSubAssignKernel( y, A, x, scalar );
   }
4334 //**********************************************************************************************
4335
4336 //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4351 template< typename VT1 // Type of the left-hand side target vector
4352 , typename MT1 // Type of the left-hand side matrix operand
4353 , typename VT2 // Type of the right-hand side vector operand
4354 , typename ST2 > // Type of the scalar value
4355 static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4356 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4357 {
4358 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
4359
4360 const size_t M( A.rows() );
4361 const size_t N( A.columns() );
4362
4363 const size_t iblock( 32768UL / sizeof( ElementType ) );
4364 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
4365
4366 BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
4367
4368 const SIMDType factor( set( scalar ) );
4369
4370 for( size_t ii=0U; ii<M; ii+=iblock ) {
4371 for( size_t jj=0UL; jj<N; jj+=jblock )
4372 {
4373 const size_t jend( min( jj+jblock, N ) );
4374 const size_t itmp( min( ii+iblock, M ) );
4375 const size_t iend( ( IsUpper_v<MT1> )
4376 ?( min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
4377 :( itmp ) );
4378
4379 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4380 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4381
4382 size_t i( ( IsLower_v<MT1> )
4383 ?( max( ii, prevMultiple( ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ), SIMDSIZE ) ) )
4384 :( ii ) );
4385
4386 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4387 {
4388 SIMDType x1( set( x[jj] ) );
4389 SIMDType xmm1( A.load(i ,jj) * x1 );
4390 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
4391 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
4392 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
4393 SIMDType xmm5( A.load(i+SIMDSIZE*4UL,jj) * x1 );
4394 SIMDType xmm6( A.load(i+SIMDSIZE*5UL,jj) * x1 );
4395 SIMDType xmm7( A.load(i+SIMDSIZE*6UL,jj) * x1 );
4396 SIMDType xmm8( A.load(i+SIMDSIZE*7UL,jj) * x1 );
4397
4398 for( size_t j=jj+1UL; j<jend; ++j ) {
4399 x1 = set( x[j] );
4400 xmm1 += A.load(i ,j) * x1;
4401 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4402 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4403 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4404 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
4405 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
4406 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
4407 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
4408 }
4409
4410 y.store( i , y.load(i ) - xmm1*factor );
4411 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4412 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4413 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4414 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4415 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4416 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4417 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4418 }
4419
4420 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4421 {
4422 SIMDType x1( set( x[jj] ) );
4423 SIMDType xmm1( A.load(i ,jj) * x1 );
4424 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
4425 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
4426 SIMDType xmm4( A.load(i+SIMDSIZE*3UL,jj) * x1 );
4427
4428 for( size_t j=jj+1UL; j<jend; ++j ) {
4429 x1 = set( x[j] );
4430 xmm1 += A.load(i ,j) * x1;
4431 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4432 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4433 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4434 }
4435
4436 y.store( i , y.load(i ) - xmm1*factor );
4437 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4438 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4439 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4440 }
4441
4442 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4443 {
4444 SIMDType x1( set( x[jj] ) );
4445 SIMDType xmm1( A.load(i ,jj) * x1 );
4446 SIMDType xmm2( A.load(i+SIMDSIZE ,jj) * x1 );
4447 SIMDType xmm3( A.load(i+SIMDSIZE*2UL,jj) * x1 );
4448
4449 for( size_t j=jj+1UL; j<jend; ++j ) {
4450 x1 = set( x[j] );
4451 xmm1 += A.load(i ,j) * x1;
4452 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4453 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4454 }
4455
4456 y.store( i , y.load(i ) - xmm1*factor );
4457 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4458 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4459 }
4460
4461 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4462 {
4463 SIMDType x1( set( x[jj] ) );
4464 SIMDType xmm1( A.load(i ,jj) * x1 );
4465 SIMDType xmm2( A.load(i+SIMDSIZE,jj) * x1 );
4466
4467 for( size_t j=jj+1UL; j<jend; ++j ) {
4468 x1 = set( x[j] );
4469 xmm1 += A.load(i ,j) * x1;
4470 xmm2 += A.load(i+SIMDSIZE,j) * x1;
4471 }
4472
4473 y.store( i , y.load(i ) - xmm1*factor );
4474 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4475 }
4476
4477 for( ; i<ipos; i+=SIMDSIZE )
4478 {
4479 SIMDType xmm1( A.load(i,jj) * set( x[jj] ) );
4480
4481 for( size_t j=jj+1UL; j<jend; ++j ) {
4482 xmm1 += A.load(i,j) * set( x[j] );
4483 }
4484
4485 y.store( i, y.load(i) - xmm1*factor );
4486 }
4487
4488 for( ; remainder && i<iend; ++i )
4489 {
4490 ElementType value( A(i,jj) * x[jj] );
4491
4492 for( size_t j=jj+1UL; j<jend; ++j ) {
4493 value += A(i,j) * x[j];
4494 }
4495
4496 y[i] -= value * scalar;
4497 }
4498 }
4499 }
4500 }
4501 //**********************************************************************************************
4502
4503 //**BLAS-based subtraction assignment to dense vectors (default)********************************
   /*!\brief Default implementation of the BLAS-based subtraction assignment of a scaled
   //        transpose dense matrix-dense vector multiplication (\f$ \vec{y}-=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // Fallback overload, selected when the BLAS kernel cannot be used for the given operand
   // types: the computation is forwarded to the large-matrix default kernel.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
   {
      selectLargeSubAssignKernel( y, A, x, scalar );
   }
4526 //**********************************************************************************************
4527
4528 //**BLAS-based subtraction assignment to dense vectors******************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   /*!\brief BLAS-based subtraction assignment of a scaled transpose dense matrix-dense vector
   //        multiplication (\f$ \vec{y}-=s*A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // This function performs the scaled subtraction assignment via BLAS kernels: triangular
   // matrices are handled by trmv() on a scaled copy of the vector, all other matrices
   // directly by gemv().
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
   {
      using ET = ElementType_t<VT1>;

      if( !IsTriangular_v<MT1> ) {
         // General case: fold both the scaling and the subtraction into the gemv() call
         gemv( y, A, x, ET(-scalar), ET(1) );
         return;
      }

      // Triangular case: trmv() multiplies in-place on a scaled copy of the vector
      ResultType_t<VT1> scaled( serial( scalar * x ) );
      trmv( scaled, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      subAssign( y, scaled );
   }
#endif
4562 //**********************************************************************************************
4563
4564 //**Subtraction assignment to sparse vectors****************************************************
4565 // No special implementation for the subtraction assignment to sparse vectors.
4566 //**********************************************************************************************
4567
4568 //**Multiplication assignment to dense vectors**************************************************
4580 template< typename VT1 > // Type of the target dense vector
4581 friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4582 {
4584
4588
4589 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4590
4591 const ResultType tmp( serial( rhs ) );
4592 multAssign( *lhs, tmp );
4593 }
4594 //**********************************************************************************************
4595
4596 //**Multiplication assignment to sparse vectors*************************************************
4597 // No special implementation for the multiplication assignment to sparse vectors.
4598 //**********************************************************************************************
4599
4600 //**Division assignment to dense vectors********************************************************
4612 template< typename VT1 > // Type of the target dense vector
4613 friend inline void divAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4614 {
4616
4620
4621 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4622
4623 const ResultType tmp( serial( rhs ) );
4624 divAssign( *lhs, tmp );
4625 }
4626 //**********************************************************************************************
4627
4628 //**Division assignment to sparse vectors*******************************************************
4629 // No special implementation for the division assignment to sparse vectors.
4630 //**********************************************************************************************
4631
4632 //**SMP assignment to dense vectors*************************************************************
4646 template< typename VT1 > // Type of the target dense vector
4647 friend inline auto smpAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4648 -> EnableIf_t< UseSMPAssign_v<VT1> >
4649 {
4651
4652 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4653
4654 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4655 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4656
4657 if( left.rows() == 0UL ) {
4658 return;
4659 }
4660 else if( left.columns() == 0UL ||
4661 ( IsStrictlyTriangular_v<MT> && left.columns() == 1UL ) ) {
4662 reset( *lhs );
4663 return;
4664 }
4665
4666 LT A( left ); // Evaluation of the left-hand side dense matrix operand
4667 RT x( right ); // Evaluation of the right-hand side dense vector operand
4668
4669 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4670 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4671 BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4672 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
4673
4674 smpAssign( *lhs, A * x * rhs.scalar_ );
4675 }
4676 //**********************************************************************************************
4677
4678 //**SMP assignment to sparse vectors************************************************************
4692 template< typename VT1 > // Type of the target sparse vector
4693 friend inline auto smpAssign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4694 -> EnableIf_t< UseSMPAssign_v<VT1> >
4695 {
4697
4701
4702 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4703
4704 const ResultType tmp( rhs );
4705 smpAssign( *lhs, tmp );
4706 }
4707 //**********************************************************************************************
4708
4709 //**SMP addition assignment to dense vectors****************************************************
4723 template< typename VT1 > // Type of the target dense vector
4724 friend inline auto smpAddAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4725 -> EnableIf_t< UseSMPAssign_v<VT1> >
4726 {
4728
4729 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4730
4731 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4732 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4733
4734 if( left.rows() == 0UL || left.columns() == 0UL ||
4735 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
4736 return;
4737 }
4738
4739 LT A( left ); // Evaluation of the left-hand side dense matrix operand
4740 RT x( right ); // Evaluation of the right-hand side dense vector operand
4741
4742 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4743 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4744 BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4745 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
4746
4747 smpAddAssign( *lhs, A * x * rhs.scalar_ );
4748 }
4749 //**********************************************************************************************
4750
4751 //**SMP addition assignment to sparse vectors***************************************************
4752 // No special implementation for the SMP addition assignment to sparse vectors.
4753 //**********************************************************************************************
4754
4755 //**SMP subtraction assignment to dense vectors*************************************************
4769 template< typename VT1 > // Type of the target dense vector
4770 friend inline auto smpSubAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4771 -> EnableIf_t< UseSMPAssign_v<VT1> >
4772 {
4774
4775 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4776
4777 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4778 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4779
4780 if( left.rows() == 0UL || left.columns() == 0UL ||
4781 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
4782 return;
4783 }
4784
4785 LT A( left ); // Evaluation of the left-hand side dense matrix operand
4786 RT x( right ); // Evaluation of the right-hand side dense vector operand
4787
4788 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4789 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4790 BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4791 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
4792
4793 smpSubAssign( *lhs, A * x * rhs.scalar_ );
4794 }
4795 //**********************************************************************************************
4796
4797 //**SMP subtraction assignment to sparse vectors************************************************
4798 // No special implementation for the SMP subtraction assignment to sparse vectors.
4799 //**********************************************************************************************
4800
4801 //**SMP multiplication assignment to dense vectors**********************************************
4816 template< typename VT1 > // Type of the target dense vector
4817 friend inline auto smpMultAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4818 -> EnableIf_t< UseSMPAssign_v<VT1> >
4819 {
4821
4825
4826 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4827
4828 const ResultType tmp( rhs );
4829 smpMultAssign( *lhs, tmp );
4830 }
4831 //**********************************************************************************************
4832
4833 //**SMP multiplication assignment to sparse vectors*********************************************
4834 // No special implementation for the SMP multiplication assignment to sparse vectors.
4835 //**********************************************************************************************
4836
4837 //**SMP division assignment to dense vectors****************************************************
4851 template< typename VT1 > // Type of the target dense vector
4852 friend inline auto smpDivAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4853 -> EnableIf_t< UseSMPAssign_v<VT1> >
4854 {
4856
4860
4861 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4862
4863 const ResultType tmp( rhs );
4864 smpDivAssign( *lhs, tmp );
4865 }
4866 //**********************************************************************************************
4867
4868 //**SMP division assignment to sparse vectors***************************************************
4869 // No special implementation for the SMP division assignment to sparse vectors.
4870 //**********************************************************************************************
4871
4872 //**Compile time checks*************************************************************************
4881 //**********************************************************************************************
4882};
4884//*************************************************************************************************
4885
4886
4887
4888
4889//=================================================================================================
4890//
4891// GLOBAL BINARY ARITHMETIC OPERATORS
4892//
4893//=================================================================================================
4894
4895//*************************************************************************************************
4926template< typename MT // Type of the left-hand side dense matrix
4927 , typename VT > // Type of the right-hand side dense vector
4928inline decltype(auto)
4929 operator*( const DenseMatrix<MT,true>& mat, const DenseVector<VT,false>& vec )
4930{
4932
4934
4935 if( (*mat).columns() != (*vec).size() ) {
4936 BLAZE_THROW_INVALID_ARGUMENT( "Matrix and vector sizes do not match" );
4937 }
4938
4939 using ReturnType = const TDMatDVecMultExpr<MT,VT>;
4940 return ReturnType( *mat, *vec );
4941}
4942//*************************************************************************************************
4943
4944
4945
4946
4947//=================================================================================================
4948//
4949// ISALIGNED SPECIALIZATIONS
4950//
4951//=================================================================================================
4952
4953//*************************************************************************************************
// The multiplication expression provides aligned memory access if and only if both the matrix
// and the vector operand do.
template< typename MT, typename VT >
struct IsAligned< TDMatDVecMultExpr<MT,VT> >
   : public BoolConstant< IsAligned_v<MT> && IsAligned_v<VT> >
{};
4960//*************************************************************************************************
4961
4962} // namespace blaze
4963
4964#endif
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Constraint on the transpose flag of vector types.
Header file for the complex data type.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Deactivation of problematic macros.
Header file for the multiplication trait.
Header file for the prevMultiple shim.
Header file for all SIMD functionality.
Data type constraint.
Constraint on the data type.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:530
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:430
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:461
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:520
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:540
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:584
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:169
VecScalarMultExpr< DenseVector< This, TF > > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:166
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:440
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:110
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:552
If_t< useAssign, const ResultType, const DVecScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:176
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:179
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:112
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:170
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:435
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:449
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:182
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:574
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:168
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:564
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:173
Base class for dense matrices.
Definition: DenseMatrix.h:82
Base class for N-dimensional dense vectors.
Definition: DenseVector.h:77
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Expression object for transpose dense matrix-dense vector multiplications.
Definition: TDMatDVecMultExpr.h:128
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:391
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDVecMultExpr.h:239
ElementType_t< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:134
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:369
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDVecMultExpr.h:245
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:379
TDMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:254
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDVecMultExpr.h:213
static constexpr bool evaluateVector
Compilation switch for the composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:148
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDVecMultExpr.h:232
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:268
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:218
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDMatDVecMultExpr.h:302
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:215
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:347
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:325
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:212
If_t< IsExpression_v< VT >, const VT, const VT & > RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:221
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:335
ResultType_t< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:132
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:359
CompositeType_t< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:135
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:392
MultTrait_t< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:210
If_t< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:224
If_t< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:227
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:141
ResultType_t< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:131
CompositeType_t< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:136
ElementType_t< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:133
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:214
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:211
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:315
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseVector base class.
Header file for the MatVecMultExpr base class.
Header file for the VecScalarMultExpr base class.
Header file for BLAS general matrix/vector multiplication functions (gemv)
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).
Definition: BLAS.h:169
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.
Definition: MatMatMultExpr.h:83
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatVecMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.
Definition: ColumnVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.
Definition: MultTrait.h:165
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > && HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:221
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector.
Definition: DenseVector.h:192
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Header file for all forward declarations for expression class templates.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base class for all matrix/vector multiplication expression templates.
Definition: MatVecMultExpr.h:69
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/vector multiplication functions (trmv).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.