real_sse_4hv_template.c 65.9 KB
Newer Older
1 2 3 4 5 6
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
7
//	Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9
//	Informatik,
10
//    - Technische Universität München, Lehrstuhl für Informatik mit
11
//	Schwerpunkt Wissenschaftliches Rechnen ,
12 13
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14 15
//	Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//	and
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
36
//    along with ELPA.	If not, see <http://www.gnu.org/licenses/>
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

65 66 67 68 69 70 71 72 73
#ifdef HAVE_SSE_INTRINSICS
#include <x86intrin.h>
#endif
#ifdef HAVE_SPARC64_SSE
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
#include <stdio.h>
#include <stdlib.h>
74

75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
#ifdef DOUBLE_PRECISION_REAL
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_SUB _mm_sub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_STORE _mm_store_pd
#endif
#ifdef SINGLE_PRECISION_REAL
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_SUB _mm_sub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_STORE _mm_store_ps
#endif

#define __forceinline __attribute__((always_inline)) static

#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

//Forward declaration
101
#ifdef HAVE_SSE_INTRINSICS
102 103 104 105 106 107 108 109 110 111
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_4_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_6_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_8_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
#endif
112 113 114 115 116 117 118 119 120 121 122 123 124
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_4_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_6_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_8_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
__forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4);
#endif
#endif
125

126
#ifdef HAVE_SSE_INTRINSICS
127 128 129 130 131 132
#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_sse_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
133 134 135 136 137 138 139 140 141
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_sparc64_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_sparc64_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif
142 143 144 145 146

/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine quad_hh_trafo_real_sse_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
147 148 149 150 151
!f>				bind(C, name="quad_hh_trafo_real_sse_4hv_double")
!f>	use, intrinsic :: iso_c_binding
!f>	integer(kind=c_int)	:: pnb, pnq, pldq, pldh
!f>	type(c_ptr), value	:: q
!f>	real(kind=c_double)	:: hh(pnb,6)
152 153 154 155 156
!f>   end subroutine
!f> end interface
!f>#endif
*/

157 158 159 160 161 162 163 164 165 166 167 168 169 170
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f>   subroutine quad_hh_trafo_real_sparc64_4hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>				bind(C, name="quad_hh_trafo_real_sparc64_4hv_double")
!f>	use, intrinsic :: iso_c_binding
!f>	integer(kind=c_int)	:: pnb, pnq, pldq, pldh
!f>	type(c_ptr), value	:: q
!f>	real(kind=c_double)	:: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/

171 172 173 174
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine quad_hh_trafo_real_sse_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
175 176 177 178 179
!f>				bind(C, name="quad_hh_trafo_real_sse_4hv_single")
!f>	use, intrinsic :: iso_c_binding
!f>	integer(kind=c_int)	:: pnb, pnq, pldq, pldh
!f>	type(c_ptr), value	:: q
!f>	real(kind=c_float)	:: hh(pnb,6)
180 181 182 183 184
!f>   end subroutine
!f> end interface
!f>#endif
*/

185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f>   subroutine quad_hh_trafo_real_sparc64_4hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>				bind(C, name="quad_hh_trafo_real_sparc64_4hv_single")
!f>	use, intrinsic :: iso_c_binding
!f>	integer(kind=c_int)	:: pnb, pnq, pldq, pldh
!f>	type(c_ptr), value	:: q
!f>	real(kind=c_float)	:: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/

#ifdef HAVE_SSE_INTRINSICS
200 201 202 203 204
#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_sse_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_sse_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
205 206 207 208 209 210 211 212 213 214
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void quad_hh_trafo_real_sparc64_4hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void quad_hh_trafo_real_sparc64_4hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif

215 216 217 218 219 220 221 222
#endif

{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;
223 224 225 226
	int worked_on;


	worked_on = 0;
227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276

	// calculating scalar products to compute
	// 4 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
	double s_1_2 = hh[(ldh)+1];
	double s_1_3 = hh[(ldh*2)+2];
	double s_2_3 = hh[(ldh*2)+1];
	double s_1_4 = hh[(ldh*3)+3];
	double s_2_4 = hh[(ldh*3)+2];
	double s_3_4 = hh[(ldh*3)+1];
#endif
#ifdef SINGLE_PRECISION_REAL
	float s_1_2 = hh[(ldh)+1];
	float s_1_3 = hh[(ldh*2)+2];
	float s_2_3 = hh[(ldh*2)+1];
	float s_1_4 = hh[(ldh*3)+3];
	float s_2_4 = hh[(ldh*3)+2];
	float s_3_4 = hh[(ldh*3)+1];
#endif
	// calculate scalar product of first and fourth householder Vector
	// loop counter = 2
	s_1_2 += hh[2-1] * hh[(2+ldh)];
	s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)];
	s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)];

	// loop counter = 3
	s_1_2 += hh[3-1] * hh[(3+ldh)];
	s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)];
	s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)];

	s_1_3 += hh[3-2] * hh[3+(ldh*2)];
	s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)];

	#pragma ivdep
	for (i = 4; i < nb; i++)
	{
		s_1_2 += hh[i-1] * hh[(i+ldh)];
		s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)];
		s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];

		s_1_3 += hh[i-2] * hh[i+(ldh*2)];
		s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];

		s_1_4 += hh[i-3] * hh[i+(ldh*3)];
	}

	// Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
	for (i = 0; i < nq-4; i+=6)
	{
277
#ifdef HAVE_SSE_INTRINSICS
278
		hh_trafo_kernel_6_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
279 280 281 282
#endif
#ifdef HAVE_SPARC64_SSE
		hh_trafo_kernel_6_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
283
		worked_on += 6;
284 285 286 287 288
	}
#endif
#ifdef SINGLE_PRECISION_REAL
	for (i = 0; i < nq-8; i+=12)
	{
289
#ifdef HAVE_SSE_INTRINSICS
290
		hh_trafo_kernel_12_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
291 292 293 294 295
#endif
#ifdef HAVE_SPARC64_SSE
		hh_trafo_kernel_12_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif

296
		worked_on += 12;
297 298 299 300 301 302 303

	}
#endif
	if (nq == i)
	{
		return;
	}
304 305 306 307

#ifdef DOUBLE_PRECISION_REAL
	if (nq-i ==4)
	{
308
#ifdef HAVE_SSE_INTRINSICS
309
		hh_trafo_kernel_4_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
310 311 312 313
#endif
#ifdef HAVE_SPARC64_SSE
		hh_trafo_kernel_4_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
314 315 316 317
		worked_on += 4;
	}
#endif

318 319 320
#ifdef SINGLE_PRECISION_REAL
	if (nq-i ==8)
	{
321
#ifdef HAVE_SSE_INTRINSICS
322
		hh_trafo_kernel_8_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
323 324 325 326
#endif
#ifdef HAVE_SPARC64_SSE
		hh_trafo_kernel_8_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
327
		worked_on += 8;
328 329
	}
#endif
330

331
#ifdef DOUBLE_PRECISION_REAL
332 333
	if (nq-i == 2)
	{
334
#ifdef HAVE_SSE_INTRINSICS
335
		hh_trafo_kernel_2_SSE_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
336 337 338 339 340
#endif
#ifdef HAVE_SPARC64_SSE
		hh_trafo_kernel_2_SPARC64_4hv_double(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif

341 342
		worked_on += 2;
	}
343
#endif
344

345
#ifdef SINGLE_PRECISION_REAL
346 347
	if (nq-i ==4)
	{
348
#ifdef HAVE_SSE_INTRINSICS
349
		hh_trafo_kernel_4_SSE_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
350 351 352 353
#endif
#ifdef HAVE_SPARC64_SSE
		hh_trafo_kernel_4_SPARC64_4hv_single(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4);
#endif
354 355
		worked_on += 4;
	}
356
#endif
357
#ifdef WITH_DEBUG
358
	if (worked_on != nq)
359
	{
360
#ifdef HAVE_SSE_INTRINSICS
361
		printf("Error in real SSE BLOCK4 kernel \n");
362 363 364 365 366
#endif
#ifdef HAVE_SPARC64_SSE
		printf("Error in real SPARC64 BLOCK4 kernel \n");
#endif

367
		abort();
368
	}
369
#endif
370

371 372 373 374 375 376 377 378 379 380 381 382 383
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 6 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 12 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 1 update is performed
 */
384
#ifdef HAVE_SSE_INTRINSICS
385 386 387 388 389 390
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_6_SSE_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SSE_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif
391 392 393 394 395 396 397 398 399
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_6_SPARC64_4hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_12_SPARC64_4hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s_1_2, float s_1_3, float s_2_3, float s_1_4, float s_2_4, float s_3_4)
#endif
#endif
400 401 402 403 404 405 406 407 408 409 410 411
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [6 x nb+3] * hh
	// hh contains four householder vectors
	/////////////////////////////////////////////////////
	int i;

	__SSE_DATATYPE a1_1 = _SSE_LOAD(&q[ldq*3]);
	__SSE_DATATYPE a2_1 = _SSE_LOAD(&q[ldq*2]);
	__SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq]);
	__SSE_DATATYPE a4_1 = _SSE_LOAD(&q[0]);

412
#ifdef HAVE_SSE_INTRINSICS
413
#ifdef DOUBLE_PRECISION_REAL
414 415 416 417 418 419
	__SSE_DATATYPE h_2_1 = _mm_set1_pd(hh[ldh+1]);
	__SSE_DATATYPE h_3_2 = _mm_set1_pd(hh[(ldh*2)+1]);
	__SSE_DATATYPE h_3_1 = _mm_set1_pd(hh[(ldh*2)+2]);
	__SSE_DATATYPE h_4_3 = _mm_set1_pd(hh[(ldh*3)+1]);
	__SSE_DATATYPE h_4_2 = _mm_set1_pd(hh[(ldh*3)+2]);
	__SSE_DATATYPE h_4_1 = _mm_set1_pd(hh[(ldh*3)+3]);
420 421 422
#endif

#ifdef SINGLE_PRECISION_REAL
423 424 425 426 427 428
	__m128 h_2_1 = _mm_set1_ps(hh[ldh+1] ); // h_2_1 contains four times hh[ldh+1]
	__m128 h_3_2 = _mm_set1_ps(hh[(ldh*2)+1]);
	__m128 h_3_1 = _mm_set1_ps(hh[(ldh*2)+2]);
	__m128 h_4_3 = _mm_set1_ps(hh[(ldh*3)+1]);
	__m128 h_4_2 = _mm_set1_ps(hh[(ldh*3)+2]);
	__m128 h_4_1 = _mm_set1_ps(hh[(ldh*3)+3]);
429
#endif
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE h_2_1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
	__SSE_DATATYPE h_3_2 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]);
	__SSE_DATATYPE h_3_1 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]);
	__SSE_DATATYPE h_4_3 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]);
	__SSE_DATATYPE h_4_2 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]);
	__SSE_DATATYPE h_4_1 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif

#ifdef SINGLE_PRECISION_REAL
	__m128 h_2_1 = _mm_set_ps(hh[ldh+1],  hh[ldh+1]); // h_2_1 contains four times hh[ldh+1]
	__m128 h_3_2 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]);
	__m128 h_3_1 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]);
	__m128 h_4_3 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]);
	__m128 h_4_2 = _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]);
	__m128 h_4_1 = _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif
#endif
451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498



	register __SSE_DATATYPE w1 = _SSE_ADD(a4_1, _SSE_MUL(a3_1, h_4_3));
	w1 = _SSE_ADD(w1, _SSE_MUL(a2_1, h_4_2));
	w1 = _SSE_ADD(w1, _SSE_MUL(a1_1, h_4_1));
	register __SSE_DATATYPE z1 = _SSE_ADD(a3_1, _SSE_MUL(a2_1, h_3_2));
	z1 = _SSE_ADD(z1, _SSE_MUL(a1_1, h_3_1));
	register __SSE_DATATYPE y1 = _SSE_ADD(a2_1, _SSE_MUL(a1_1, h_2_1));
	register __SSE_DATATYPE x1 = a1_1;

	__SSE_DATATYPE a1_2 = _SSE_LOAD(&q[(ldq*3)+offset]);
	__SSE_DATATYPE a2_2 = _SSE_LOAD(&q[(ldq*2)+offset]);
	__SSE_DATATYPE a3_2 = _SSE_LOAD(&q[ldq+offset]);
	__SSE_DATATYPE a4_2 = _SSE_LOAD(&q[0+offset]);

	register __SSE_DATATYPE w2 = _SSE_ADD(a4_2, _SSE_MUL(a3_2, h_4_3));
	w2 = _SSE_ADD(w2, _SSE_MUL(a2_2, h_4_2));
	w2 = _SSE_ADD(w2, _SSE_MUL(a1_2, h_4_1));
	register __SSE_DATATYPE z2 = _SSE_ADD(a3_2, _SSE_MUL(a2_2, h_3_2));
	z2 = _SSE_ADD(z2, _SSE_MUL(a1_2, h_3_1));
	register __SSE_DATATYPE y2 = _SSE_ADD(a2_2, _SSE_MUL(a1_2, h_2_1));
	register __SSE_DATATYPE x2 = a1_2;

	__SSE_DATATYPE a1_3 = _SSE_LOAD(&q[(ldq*3)+2*offset]);
	__SSE_DATATYPE a2_3 = _SSE_LOAD(&q[(ldq*2)+2*offset]);
	__SSE_DATATYPE a3_3 = _SSE_LOAD(&q[ldq+2*offset]);
	__SSE_DATATYPE a4_3 = _SSE_LOAD(&q[0+2*offset]);

	register __SSE_DATATYPE w3 = _SSE_ADD(a4_3, _SSE_MUL(a3_3, h_4_3));
	w3 = _SSE_ADD(w3, _SSE_MUL(a2_3, h_4_2));
	w3 = _SSE_ADD(w3, _SSE_MUL(a1_3, h_4_1));
	register __SSE_DATATYPE z3 = _SSE_ADD(a3_3, _SSE_MUL(a2_3, h_3_2));
	z3 = _SSE_ADD(z3, _SSE_MUL(a1_3, h_3_1));
	register __SSE_DATATYPE y3 = _SSE_ADD(a2_3, _SSE_MUL(a1_3, h_2_1));
	register __SSE_DATATYPE x3 = a1_3;

	__SSE_DATATYPE q1;
	__SSE_DATATYPE q2;
	__SSE_DATATYPE q3;

	__SSE_DATATYPE h1;
	__SSE_DATATYPE h2;
	__SSE_DATATYPE h3;
	__SSE_DATATYPE h4;

	for(i = 4; i < nb; i++)
	{
499
#ifdef HAVE_SSE_INTRINSICS
500
#ifdef DOUBLE_PRECISION_REAL
501
		h1 = _mm_set1_pd(hh[i-3]);
502 503
#endif
#ifdef SINGLE_PRECISION_REAL
504
		h1 = _mm_set1_ps(hh[i-3]);
505 506 507 508 509 510 511 512 513 514
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
		h1 = _mm_set_pd(hh[i-3], hh[i-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h1 = _mm_set_ps(hh[i-3], hh[i-3]);
#endif
515 516 517 518 519 520 521 522 523
#endif
		q1 = _SSE_LOAD(&q[i*ldq]);
		q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
		q3 = _SSE_LOAD(&q[(i*ldq)+2*offset]);

		x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
		x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
		x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));

524
#ifdef HAVE_SSE_INTRINSICS
525
#ifdef DOUBLE_PRECISION_REAL
526
		h2 = _mm_set1_pd(hh[ldh+i-2]);
527 528
#endif
#ifdef SINGLE_PRECISION_REAL
529
		h2 = _mm_set1_ps(hh[ldh+i-2]);
530 531 532 533 534 535 536 537 538 539 540
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
		h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]);
#endif

541 542 543 544 545
#endif
		y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
		y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
		y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));

546
#ifdef HAVE_SSE_INTRINSICS
547
#ifdef DOUBLE_PRECISION_REAL
548
		h3 = _mm_set1_pd(hh[(ldh*2)+i-1]);
549 550
#endif
#ifdef SINGLE_PRECISION_REAL
551
		h3 = _mm_set1_ps(hh[(ldh*2)+i-1]);
552
#endif
553 554 555 556 557 558 559 560 561 562 563
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
		h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]);
#endif
#endif

564 565 566 567

		z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
		z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
		z3 = _SSE_ADD(z3, _SSE_MUL(q3,h3));
568
#ifdef HAVE_SSE_INTRINSICS
569
#ifdef DOUBLE_PRECISION_REAL
570
		h4 = _mm_set1_pd(hh[(ldh*3)+i]);
571 572
#endif
#ifdef SINGLE_PRECISION_REAL
573
		h4 = _mm_set1_ps(hh[(ldh*3)+i]);
574
#endif
575 576 577 578 579 580 581 582 583 584 585
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
		h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]);
#endif
#endif

586 587 588 589 590
		w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
		w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
		w3 = _SSE_ADD(w3, _SSE_MUL(q3,h4));
	}

591
#ifdef HAVE_SSE_INTRINSICS
592
#ifdef DOUBLE_PRECISION_REAL
593
	h1 = _mm_set1_pd(hh[nb-3]);
594 595
#endif
#ifdef SINGLE_PRECISION_REAL
596
	h1 = _mm_set1_ps(hh[nb-3]);
597
#endif
598
#endif
599

600 601 602 603 604 605 606 607
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set_pd(hh[nb-3], hh[nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set_ps(hh[nb-3], hh[nb-3]);
#endif
#endif
608 609 610 611 612 613 614 615
	q1 = _SSE_LOAD(&q[nb*ldq]);
	q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
	q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]);

	x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
	x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
	x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));

616
#ifdef HAVE_SSE_INTRINSICS
617
#ifdef DOUBLE_PRECISION_REAL
618
	h2 = _mm_set1_pd(hh[ldh+nb-2]);
619 620
#endif
#ifdef SINGLE_PRECISION_REAL
621
	h2 = _mm_set1_ps(hh[ldh+nb-2]);
622 623 624 625 626 627 628 629 630 631
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]);
#endif
632 633 634 635 636 637
#endif

	y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
	y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
	y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));

638
#ifdef HAVE_SSE_INTRINSICS
639
#ifdef DOUBLE_PRECISION_REAL
640
	h3 = _mm_set1_pd(hh[(ldh*2)+nb-1]);
641 642
#endif
#ifdef SINGLE_PRECISION_REAL
643
	h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]);
644
#endif
645 646 647 648 649 650 651 652 653 654 655
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]);
#endif
#endif

656 657 658 659
	z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
	z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
	z3 = _SSE_ADD(z3, _SSE_MUL(q3,h3));

660
#ifdef HAVE_SSE_INTRINSICS
661
#ifdef DOUBLE_PRECISION_REAL
662
	h1 = _mm_set1_pd(hh[nb-2]);
663 664
#endif
#ifdef SINGLE_PRECISION_REAL
665
	h1 = _mm_set1_ps(hh[nb-2]);
666
#endif
667 668 669 670 671 672 673 674 675 676 677
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set_pd(hh[nb-2], hh[nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set_ps(hh[nb-2], hh[nb-2]);
#endif
#endif

678 679 680 681 682 683 684 685
	q1 = _SSE_LOAD(&q[(nb+1)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]);
	q3 = _SSE_LOAD(&q[((nb+1)*ldq)+2*offset]);

	x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
	x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
	x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));

686
#ifdef HAVE_SSE_INTRINSICS
687
#ifdef DOUBLE_PRECISION_REAL
688
	h2 = _mm_set1_pd(hh[(ldh*1)+nb-1]);
689 690
#endif
#ifdef SINGLE_PRECISION_REAL
691
	h2 = _mm_set1_ps(hh[ldh+nb-1]);
692
#endif
693 694 695 696 697 698 699 700 701 702 703 704
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set_pd(hh[(ldh*1)+nb-1], hh[(ldh*1)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set_ps(hh[ldh+nb-1], hh[(ldh*1)+nb-1]);
#endif
#endif


705 706 707 708
	y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
	y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
	y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));

709
#ifdef HAVE_SSE_INTRINSICS
710
#ifdef DOUBLE_PRECISION_REAL
711
	h1 = _mm_set1_pd(hh[nb-1]);
712 713
#endif
#ifdef SINGLE_PRECISION_REAL
714
	h1 = _mm_set1_ps(hh[nb-1]);
715
#endif
716 717 718 719 720 721 722 723 724 725 726 727
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set_pd(hh[nb-1], hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set_ps(hh[nb-1], hh[nb-1]);
#endif
#endif


728 729 730 731 732 733 734 735 736 737 738 739
	q1 = _SSE_LOAD(&q[(nb+2)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]);
	q3 = _SSE_LOAD(&q[((nb+2)*ldq)+2*offset]);

	x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
	x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
	x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));

	/////////////////////////////////////////////////////
	// Rank-1 update of Q [6 x nb+3]
	/////////////////////////////////////////////////////

740
#ifdef HAVE_SSE_INTRINSICS
741
#ifdef DOUBLE_PRECISION_REAL
742
	__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
743 744
#endif
#ifdef SINGLE_PRECISION_REAL
745
       __m128 tau1 = _mm_set1_ps(hh[0]);
746
#endif
747 748 749 750 751 752 753 754 755 756 757
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau1 = _mm_set_pd(hh[0], hh[0]);
#endif
#ifdef SINGLE_PRECISION_REAL
       __m128 tau1 = _mm_set_ps(hh[0], hh[0]);
#endif
#endif

758 759 760 761

	h1 = tau1;
	x1 = _SSE_MUL(x1, h1);
	x2 = _SSE_MUL(x2, h1);
762
	x3 = _SSE_MUL(x3, h1)
763

764
#ifdef HAVE_SSE_INTRINSICS
765
#ifdef DOUBLE_PRECISION_REAL
766 767
	__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
	__SSE_DATATYPE vs_1_2 = _mm_set1_pd(s_1_2);
768 769
#endif
#ifdef SINGLE_PRECISION_REAL
770 771
	__m128 tau2 = _mm_set1_ps(hh[ldh]);
	__m128 vs_1_2 = _mm_set1_ps(s_1_2);
772
#endif
773 774 775 776 777 778 779 780 781 782 783 784 785
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau2 = _mm_set_pd(hh[ldh], hh[ldh]);
	__SSE_DATATYPE vs_1_2 = _mm_set_pd(s_1_2, s_1_2);
#endif
#ifdef SINGLE_PRECISION_REAL
	__m128 tau2 = _mm_set_ps(hh[ldh], hh[ldh]);
	__m128 vs_1_2 = _mm_set_ps(s_1_2, s_1_2);
#endif
#endif

786 787 788 789 790 791 792 793

	h1 = tau2;
	h2 = _SSE_MUL(h1, vs_1_2);

	y1 = _SSE_SUB(_SSE_MUL(y1,h1), _SSE_MUL(x1,h2));
	y2 = _SSE_SUB(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2));
	y3 = _SSE_SUB(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));

794
#ifdef HAVE_SSE_INTRINSICS
795
#ifdef DOUBLE_PRECISION_REAL
796 797 798
	__SSE_DATATYPE tau3 = _mm_set1_pd(hh[ldh*2]);
	__SSE_DATATYPE vs_1_3 = _mm_set1_pd(s_1_3);
	__SSE_DATATYPE vs_2_3 = _mm_set1_pd(s_2_3);
799 800
#endif
#ifdef SINGLE_PRECISION_REAL
801 802 803
	__m128 tau3 = _mm_set1_ps(hh[ldh*2]);
	__m128 vs_1_3 = _mm_set1_ps(s_1_3);
	__m128 vs_2_3 = _mm_set1_ps(s_2_3);
804
#endif
805 806 807 808 809 810 811 812 813 814 815 816 817 818 819
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau3 = _mm_set_pd(hh[ldh*2], hh[ldh*2]);
	__SSE_DATATYPE vs_1_3 = _mm_set_pd(s_1_3, s_1_3);
	__SSE_DATATYPE vs_2_3 = _mm_set_pd(s_2_3, s_2_3);
#endif
#ifdef SINGLE_PRECISION_REAL
	__m128 tau3 = _mm_set_ps(hh[ldh*2], hh[ldh*2]);
	__m128 vs_1_3 = _mm_set_ps(s_1_3, s_1_3);
	__m128 vs_2_3 = _mm_set_ps(s_2_3, s_2_3);
#endif
#endif

820 821 822 823 824 825 826 827 828

	h1 = tau3;
	h2 = _SSE_MUL(h1, vs_1_3);
	h3 = _SSE_MUL(h1, vs_2_3);

	z1 = _SSE_SUB(_SSE_MUL(z1,h1), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2)));
	z2 = _SSE_SUB(_SSE_MUL(z2,h1), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2)));
	z3 = _SSE_SUB(_SSE_MUL(z3,h1), _SSE_ADD(_SSE_MUL(y3,h3), _SSE_MUL(x3,h2)));

829
#ifdef HAVE_SSE_INTRINSICS
830
#ifdef DOUBLE_PRECISION_REAL
831 832 833 834
	__SSE_DATATYPE tau4 = _mm_set1_pd(hh[ldh*3]);
	__SSE_DATATYPE vs_1_4 = _mm_set1_pd(s_1_4);
	__SSE_DATATYPE vs_2_4 = _mm_set1_pd(s_2_4);
	__SSE_DATATYPE vs_3_4 = _mm_set1_pd(s_3_4);
835 836
#endif
#ifdef SINGLE_PRECISION_REAL
837 838 839 840
	__m128 tau4 = _mm_set1_ps(hh[ldh*3]);
	__m128 vs_1_4 = _mm_set1_ps(s_1_4);
	__m128 vs_2_4 = _mm_set1_ps(s_2_4);
	__m128 vs_3_4 = _mm_set1_ps(s_3_4);
841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau4 = _mm_set_pd(hh[ldh*3], hh[ldh*3]);
	__SSE_DATATYPE vs_1_4 = _mm_set_pd(s_1_4, s_1_4);
	__SSE_DATATYPE vs_2_4 = _mm_set_pd(s_2_4, s_2_4);
	__SSE_DATATYPE vs_3_4 = _mm_set_pd(s_3_4, s_3_4);
#endif
#ifdef SINGLE_PRECISION_REAL
	__m128 tau4 = _mm_set_ps(hh[ldh*3], hh[ldh*3]);
	__m128 vs_1_4 = _mm_set_ps(s_1_4, s_1_4);
	__m128 vs_2_4 = _mm_set_ps(s_2_4, s_2_4);
	__m128 vs_3_4 = _mm_set_ps(s_3_4, s_3_4);
#endif
857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877
#endif

	h1 = tau4;
	h2 = _SSE_MUL(h1, vs_1_4);
	h3 = _SSE_MUL(h1, vs_2_4);
	h4 = _SSE_MUL(h1, vs_3_4);

	w1 = _SSE_SUB(_SSE_MUL(w1,h1), _SSE_ADD(_SSE_MUL(z1,h4), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2))));
	w2 = _SSE_SUB(_SSE_MUL(w2,h1), _SSE_ADD(_SSE_MUL(z2,h4), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2))));
	w3 = _SSE_SUB(_SSE_MUL(w3,h1), _SSE_ADD(_SSE_MUL(z3,h4), _SSE_ADD(_SSE_MUL(y3,h3), _SSE_MUL(x3,h2))));

	q1 = _SSE_LOAD(&q[0]);
	q2 = _SSE_LOAD(&q[offset]);
	q3 = _SSE_LOAD(&q[2*offset]);
	q1 = _SSE_SUB(q1, w1);
	q2 = _SSE_SUB(q2, w2);
	q3 = _SSE_SUB(q3, w3);
	_SSE_STORE(&q[0],q1);
	_SSE_STORE(&q[offset],q2);
	_SSE_STORE(&q[2*offset],q3);

878
#ifdef HAVE_SSE_INTRINSICS
879
#ifdef DOUBLE_PRECISION_REAL
880
	h4 = _mm_set1_pd(hh[(ldh*3)+1]);
881 882
#endif
#ifdef SINGLE_PRECISION_REAL
883
	h4 = _mm_set1_ps(hh[(ldh*3)+1]);
884
#endif
885 886 887 888 889 890 891 892 893 894 895
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]);
#endif
#endif

896 897 898 899 900 901 902 903 904 905 906 907 908

	q1 = _SSE_LOAD(&q[ldq]);
	q2 = _SSE_LOAD(&q[ldq+offset]);
	q3 = _SSE_LOAD(&q[ldq+2*offset]);

	q1 = _SSE_SUB(q1, _SSE_ADD(z1, _SSE_MUL(w1, h4)));
	q2 = _SSE_SUB(q2, _SSE_ADD(z2, _SSE_MUL(w2, h4)));
	q3 = _SSE_SUB(q3, _SSE_ADD(z3, _SSE_MUL(w3, h4)));

	_SSE_STORE(&q[ldq],q1);
	_SSE_STORE(&q[ldq+offset],q2);
	_SSE_STORE(&q[ldq+2*offset],q3);

909
#ifdef HAVE_SSE_INTRINSICS
910
#ifdef DOUBLE_PRECISION_REAL
911
	h4 = _mm_set1_pd(hh[(ldh*3)+2]);
912 913
#endif
#ifdef SINGLE_PRECISION_REAL
914
	h4 = _mm_set1_ps(hh[(ldh*3)+2]);
915
#endif
916 917 918 919 920 921 922 923 924 925 926
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]);
#endif
#endif

927 928 929 930 931 932 933 934 935 936 937
	q1 = _SSE_LOAD(&q[ldq*2]);
	q2 = _SSE_LOAD(&q[(ldq*2)+offset]);
	q3 = _SSE_LOAD(&q[(ldq*2)+2*offset]);
	q1 = _SSE_SUB(q1, y1);
	q2 = _SSE_SUB(q2, y2);
	q3 = _SSE_SUB(q3, y3);

	q1 = _SSE_SUB(q1, _SSE_MUL(w1, h4));
	q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4));
	q3 = _SSE_SUB(q3, _SSE_MUL(w3, h4));

938
#ifdef HAVE_SSE_INTRINSICS
939
#ifdef DOUBLE_PRECISION_REAL
940
	h3 = _mm_set1_pd(hh[(ldh*2)+1]);
941 942
#endif
#ifdef SINGLE_PRECISION_REAL
943
	h3 = _mm_set1_ps(hh[(ldh*2)+1]);
944
#endif
945 946 947 948 949 950 951 952 953 954 955
#endif

#ifdef HAVE_SPARC64_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]);
#endif
#endif

956 957 958 959 960 961 962 963
	q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
	q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));
	q3 = _SSE_SUB(q3, _SSE_MUL(z3, h3));

	_SSE_STORE(&q[ldq*2],q1);
	_SSE_STORE(&q[(ldq*2)+offset],q2);
	_SSE_STORE(&q[(ldq*2)+2*offset],q3);

964
#ifdef HAVE_SSE_INTRINSICS
965
#ifdef DOUBLE_PRECISION_REAL
966
	h4 = _mm_set1_pd(hh[(ldh*3)+3]);
967 968
#endif
#ifdef SINGLE_PRECISION_REAL
969
	h4 = _mm_set1_ps(hh[(ldh*3)+3]);
970
#endif
971 972 973 974 975 976 977 978 979 980 981
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif
#endif

982 983 984 985 986 987 988 989 990 991 992
	q1 = _SSE_LOAD(&q[ldq*3]);
	q2 = _SSE_LOAD(&q[(ldq*3)+offset]);
	q3 = _SSE_LOAD(&q[(ldq*3)+2*offset]);
	q1 = _SSE_SUB(q1, x1);
	q2 = _SSE_SUB(q2, x2);
	q3 = _SSE_SUB(q3, x3);

	q1 = _SSE_SUB(q1, _SSE_MUL(w1, h4));
	q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4));
	q3 = _SSE_SUB(q3, _SSE_MUL(w3, h4));

993
#ifdef HAVE_SSE_INTRINSICS
994
#ifdef DOUBLE_PRECISION_REAL
995
	h2 = _mm_set1_pd(hh[ldh+1]);
996 997
#endif
#ifdef SINGLE_PRECISION_REAL
998
	h2 = _mm_set1_ps(hh[ldh+1]);
999
#endif
1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set_ps(hh[ldh+1], hh[ldh+1]);
#endif
#endif

1011 1012 1013 1014 1015

	q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2));
	q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2));
	q3 = _SSE_SUB(q3, _SSE_MUL(y3, h2));

1016
#ifdef HAVE_SSE_INTRINSICS
1017
#ifdef DOUBLE_PRECISION_REAL
1018
	h3 = _mm_set1_pd(hh[(ldh*2)+2]);
1019 1020
#endif
#ifdef SINGLE_PRECISION_REAL
1021
	h3 = _mm_set1_ps(hh[(ldh*2)+2]);
1022
#endif
1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]);
#endif
#endif

1034 1035 1036 1037 1038 1039 1040 1041 1042 1043

	q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
	q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));
	q3 = _SSE_SUB(q3, _SSE_MUL(z3, h3));
	_SSE_STORE(&q[ldq*3], q1);
	_SSE_STORE(&q[(ldq*3)+offset], q2);
	_SSE_STORE(&q[(ldq*3)+2*offset], q3);

	for (i = 4; i < nb; i++)
	{
1044
#ifdef HAVE_SSE_INTRINSICS
1045
#ifdef DOUBLE_PRECISION_REAL
1046
		h1 = _mm_set1_pd(hh[i-3]);
1047 1048
#endif
#ifdef SINGLE_PRECISION_REAL
1049
		h1 = _mm_set1_ps(hh[i-3]);
1050 1051 1052 1053 1054 1055 1056 1057 1058 1059
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
		h1 = _mm_set_pd(hh[i-3], hh[i-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h1 = _mm_set_ps(hh[i-3], hh[i-3]);
#endif
1060 1061 1062 1063 1064 1065 1066 1067 1068 1069
#endif

		q1 = _SSE_LOAD(&q[i*ldq]);
		q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
		q3 = _SSE_LOAD(&q[(i*ldq)+2*offset]);

		q1 = _SSE_SUB(q1, _SSE_MUL(x1,h1));
		q2 = _SSE_SUB(q2, _SSE_MUL(x2,h1));
		q3 = _SSE_SUB(q3, _SSE_MUL(x3,h1));

1070
#ifdef HAVE_SSE_INTRINSICS
1071
#ifdef DOUBLE_PRECISION_REAL
1072
		h2 = _mm_set1_pd(hh[ldh+i-2]);
1073 1074
#endif
#ifdef SINGLE_PRECISION_REAL
1075
		h2 = _mm_set1_ps(hh[ldh+i-2]);
1076 1077 1078 1079 1080 1081 1082 1083 1084 1085
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
		h2 = _mm_set_pd(hh[ldh+i-2], hh[ldh+i-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h2 = _mm_set_ps(hh[ldh+i-2], hh[ldh+i-2]);
#endif
1086 1087 1088 1089 1090 1091
#endif

		q1 = _SSE_SUB(q1, _SSE_MUL(y1,h2));
		q2 = _SSE_SUB(q2, _SSE_MUL(y2,h2));
		q3 = _SSE_SUB(q3, _SSE_MUL(y3,h2));

1092
#ifdef HAVE_SSE_INTRINSICS
1093
#ifdef DOUBLE_PRECISION_REAL
1094
		h3 = _mm_set1_pd(hh[(ldh*2)+i-1]);
1095 1096
#endif
#ifdef SINGLE_PRECISION_REAL
1097
		h3 = _mm_set1_ps(hh[(ldh*2)+i-1]);
1098
#endif
1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
		h3 = _mm_set_pd(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h3 = _mm_set_ps(hh[(ldh*2)+i-1], hh[(ldh*2)+i-1]);
#endif
#endif

1110 1111 1112 1113
		q1 = _SSE_SUB(q1, _SSE_MUL(z1,h3));
		q2 = _SSE_SUB(q2, _SSE_MUL(z2,h3));
		q3 = _SSE_SUB(q3, _SSE_MUL(z3,h3));

1114
#ifdef HAVE_SSE_INTRINSICS
1115
#ifdef DOUBLE_PRECISION_REAL
1116
		h4 = _mm_set1_pd(hh[(ldh*3)+i]);
1117 1118
#endif
#ifdef SINGLE_PRECISION_REAL
1119
		h4 = _mm_set1_ps(hh[(ldh*3)+i]);
1120
#endif
1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131
#endif

#ifdef HAVE_SPRC64_SSE
#ifdef DOUBLE_PRECISION_REAL
		h4 = _mm_set_pd(hh[(ldh*3)+i], hh[(ldh*3)+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h4 = _mm_set_ps(hh[(ldh*3)+i], hh[(ldh*3)+i]);
#endif
#endif

1132 1133 1134 1135 1136 1137 1138 1139 1140
		q1 = _SSE_SUB(q1, _SSE_MUL(w1,h4));
		q2 = _SSE_SUB(q2, _SSE_MUL(w2,h4));
		q3 = _SSE_SUB(q3, _SSE_MUL(w3,h4));

		_SSE_STORE(&q[i*ldq],q1);
		_SSE_STORE(&q[(i*ldq)+offset],q2);
		_SSE_STORE(&q[(i*ldq)+2*offset],q3);
	}

1141
#ifdef HAVE_SSE_INTRINSICS
1142
#ifdef DOUBLE_PRECISION_REAL
1143
	h1 = _mm_set1_pd(hh[nb-3]);
1144 1145
#endif
#ifdef SINGLE_PRECISION_REAL
1146
	h1 = _mm_set1_ps(hh[nb-3]);
1147
#endif
1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set_pd(hh[nb-3], hh[nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set_ps(hh[nb-3], hh[nb-3]);
#endif
#endif

1159 1160 1161 1162 1163 1164 1165 1166
	q1 = _SSE_LOAD(&q[nb*ldq]);
	q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
	q3 = _SSE_LOAD(&q[(nb*ldq)+2*offset]);

	q1 = _SSE_SUB(q1, _SSE_MUL(x1, h1));
	q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));
	q3 = _SSE_SUB(q3, _SSE_MUL(x3, h1));

1167
#ifdef HAVE_SSE_INTRINSCS
1168
#ifdef DOUBLE_PRECISION_REAL
1169
	h2 = _mm_set1_pd(hh[ldh+nb-2]);
1170 1171
#endif
#ifdef SINGLE_PRECISION_REAL
1172
	h2 = _mm_set1_ps(hh[ldh+nb-2]);
1173
#endif
1174 1175 1176 1177 1178 1179 1180 1181 1182 1183
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]);
#endif
#endif
1184 1185 1186 1187 1188 1189


	q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2));
	q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2));
	q3 = _SSE_SUB(q3, _SSE_MUL(y3, h2));

1190
#ifdef HAVE_SSE_INTRINSICS
1191
#ifdef DOUBLE_PRECISION_REAL
1192
	h3 = _mm_set1_pd(hh[(ldh*2)+nb-1]);
1193 1194
#endif
#ifdef SINGLE_PRECISION_REAL
1195
	h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]);
1196
#endif
1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set_pd(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set_ps(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]);
#endif
#endif

1208 1209 1210 1211 1212 1213 1214 1215 1216

	q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
	q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));
	q3 = _SSE_SUB(q3, _SSE_MUL(z3, h3));

	_SSE_STORE(&q[nb*ldq],q1);
	_SSE_STORE(&q[(nb*ldq)+offset],q2);
	_SSE_STORE(&q[(nb*ldq)+2*offset],q3);

1217
#ifdef HAVE_SSE_INTRINSICS
1218
#ifdef DOUBLE_PRECISION_REAL
1219
	h1 = _mm_set1_pd(hh[nb-2]);