.ident	"s390x.S, version 1.1"
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================

| 11 | .text |
| 12 | |
| 13 | #define zero %r0 |
| 14 | |
| 15 | // BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); |
| 16 | .globl bn_mul_add_words |
| 17 | .type bn_mul_add_words,@function |
| 18 | .align 4 |
| 19 | bn_mul_add_words: |
| 20 | lghi zero,0 // zero = 0 |
| 21 | la %r1,0(%r2) // put rp aside |
| 22 | lghi %r2,0 // i=0; |
| 23 | ltgfr %r4,%r4 |
| 24 | bler %r14 // if (len<=0) return 0; |
| 25 | |
| 26 | stmg %r6,%r10,48(%r15) |
| 27 | lghi %r10,3 |
| 28 | lghi %r8,0 // carry = 0 |
| 29 | nr %r10,%r4 // len%4 |
| 30 | sra %r4,2 // cnt=len/4 |
| 31 | jz .Loop1_madd // carry is incidentally cleared if branch taken |
| 32 | algr zero,zero // clear carry |
| 33 | |
| 34 | .Loop4_madd: |
| 35 | lg %r7,0(%r2,%r3) // ap[i] |
| 36 | mlgr %r6,%r5 // *=w |
| 37 | alcgr %r7,%r8 // +=carry |
| 38 | alcgr %r6,zero |
| 39 | alg %r7,0(%r2,%r1) // +=rp[i] |
| 40 | stg %r7,0(%r2,%r1) // rp[i]= |
| 41 | |
| 42 | lg %r9,8(%r2,%r3) |
| 43 | mlgr %r8,%r5 |
| 44 | alcgr %r9,%r6 |
| 45 | alcgr %r8,zero |
| 46 | alg %r9,8(%r2,%r1) |
| 47 | stg %r9,8(%r2,%r1) |
| 48 | |
| 49 | lg %r7,16(%r2,%r3) |
| 50 | mlgr %r6,%r5 |
| 51 | alcgr %r7,%r8 |
| 52 | alcgr %r6,zero |
| 53 | alg %r7,16(%r2,%r1) |
| 54 | stg %r7,16(%r2,%r1) |
| 55 | |
| 56 | lg %r9,24(%r2,%r3) |
| 57 | mlgr %r8,%r5 |
| 58 | alcgr %r9,%r6 |
| 59 | alcgr %r8,zero |
| 60 | alg %r9,24(%r2,%r1) |
| 61 | stg %r9,24(%r2,%r1) |
| 62 | |
| 63 | la %r2,32(%r2) // i+=4 |
| 64 | brct %r4,.Loop4_madd |
| 65 | |
| 66 | la %r10,1(%r10) // see if len%4 is zero ... |
| 67 | brct %r10,.Loop1_madd // without touching condition code:-) |
| 68 | |
| 69 | .Lend_madd: |
| 70 | alcgr %r8,zero // collect carry bit |
| 71 | lgr %r2,%r8 |
| 72 | lmg %r6,%r10,48(%r15) |
| 73 | br %r14 |
| 74 | |
| 75 | .Loop1_madd: |
| 76 | lg %r7,0(%r2,%r3) // ap[i] |
| 77 | mlgr %r6,%r5 // *=w |
| 78 | alcgr %r7,%r8 // +=carry |
| 79 | alcgr %r6,zero |
| 80 | alg %r7,0(%r2,%r1) // +=rp[i] |
| 81 | stg %r7,0(%r2,%r1) // rp[i]= |
| 82 | |
| 83 | lgr %r8,%r6 |
| 84 | la %r2,8(%r2) // i++ |
| 85 | brct %r10,.Loop1_madd |
| 86 | |
| 87 | j .Lend_madd |
| 88 | .size bn_mul_add_words,.-bn_mul_add_words |
| 89 | |
// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
//
// rp[i] = ap[i]*w + carry for i = 0..len-1; returns the final carry.
// In:   %r2 = rp, %r3 = ap, %r4 = len (signed 32-bit), %r5 = w
// Out:  %r2 = carry
// Same structure as bn_mul_add_words above, minus the rp[i] addend:
// alternating mlgr product pairs %r6:%r7 / %r8:%r9, carry chained
// through the CC bit and the high product halves.
.globl	bn_mul_words
.type	bn_mul_words,@function
.align	4
bn_mul_words:
	lghi	zero,0			// zero = 0
	la	%r1,0(%r2)		// put rp aside
	lghi	%r2,0			// i=0;
	ltgfr	%r4,%r4
	bler	%r14			// if (len<=0) return 0;

	stmg	%r6,%r10,48(%r15)
	lghi	%r10,3
	lghi	%r8,0			// carry = 0
	nr	%r10,%r4		// len%4
	sra	%r4,2			// cnt=len/4
	jz	.Loop1_mul		// carry is incidentally cleared if branch taken
	algr	zero,zero		// clear carry

.Loop4_mul:
	lg	%r7,0(%r2,%r3)		// ap[i]
	mlgr	%r6,%r5			// *=w
	alcgr	%r7,%r8			// +=carry
	stg	%r7,0(%r2,%r1)		// rp[i]=

	lg	%r9,8(%r2,%r3)
	mlgr	%r8,%r5
	alcgr	%r9,%r6
	stg	%r9,8(%r2,%r1)

	lg	%r7,16(%r2,%r3)
	mlgr	%r6,%r5
	alcgr	%r7,%r8
	stg	%r7,16(%r2,%r1)

	lg	%r9,24(%r2,%r3)
	mlgr	%r8,%r5
	alcgr	%r9,%r6
	stg	%r9,24(%r2,%r1)

	la	%r2,32(%r2)		// i+=4
	brct	%r4,.Loop4_mul

	la	%r10,1(%r10)		// see if len%4 is zero ...
	brct	%r10,.Loop1_mul		// without touching condition code:-)

.Lend_mul:
	alcgr	%r8,zero		// collect carry bit
	lgr	%r2,%r8
	lmg	%r6,%r10,48(%r15)
	br	%r14

.Loop1_mul:
	lg	%r7,0(%r2,%r3)		// ap[i]
	mlgr	%r6,%r5			// *=w
	alcgr	%r7,%r8			// +=carry
	stg	%r7,0(%r2,%r1)		// rp[i]=

	lgr	%r8,%r6			// carry = high half
	la	%r2,8(%r2)		// i++
	brct	%r10,.Loop1_mul

	j	.Lend_mul
.size	bn_mul_words,.-bn_mul_words
| 154 | |
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
//
// rp[2i],rp[2i+1] = low,high 64 bits of ap[i]^2 for i = 0..len-1.
// In:   %r2 = rp (2*len words), %r3 = ap, %r4 = len (signed 32-bit)
// No return value; no carries cross word squares, so no carry chain.
// %r6:%r7 is the mlgr pair: lg loads ap[i] into %r7, mlgr squares it.
.globl	bn_sqr_words
.type	bn_sqr_words,@function
.align	4
bn_sqr_words:
	ltgfr	%r4,%r4
	bler	%r14			// if (len<=0) return;

	stmg	%r6,%r7,48(%r15)
	srag	%r1,%r4,2		// cnt=len/4 (srag sets CC for jz)
	jz	.Loop1_sqr		// len<4: %r4 already holds the count

.Loop4_sqr:
	lg	%r7,0(%r3)
	mlgr	%r6,%r7			// %r6:%r7 = ap[i]^2
	stg	%r7,0(%r2)
	stg	%r6,8(%r2)

	lg	%r7,8(%r3)
	mlgr	%r6,%r7
	stg	%r7,16(%r2)
	stg	%r6,24(%r2)

	lg	%r7,16(%r3)
	mlgr	%r6,%r7
	stg	%r7,32(%r2)
	stg	%r6,40(%r2)

	lg	%r7,24(%r3)
	mlgr	%r6,%r7
	stg	%r7,48(%r2)
	stg	%r6,56(%r2)

	la	%r3,32(%r3)
	la	%r2,64(%r2)
	brct	%r1,.Loop4_sqr

	lghi	%r1,3
	nr	%r4,%r1			// cnt=len%4
	jz	.Lend_sqr

.Loop1_sqr:
	lg	%r7,0(%r3)
	mlgr	%r6,%r7
	stg	%r7,0(%r2)
	stg	%r6,8(%r2)

	la	%r3,8(%r3)
	la	%r2,16(%r2)
	brct	%r4,.Loop1_sqr

.Lend_sqr:
	lmg	%r6,%r7,48(%r15)
	br	%r14
.size	bn_sqr_words,.-bn_sqr_words
| 210 | |
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
//
// Returns the quotient of the 128-bit value (h<<64)|l divided by d.
// In:   %r2 = h, %r3 = l, %r4 = d
// Out:  %r2 = quotient
// dlgr divides the even-odd pair %r2:%r3 by %r4, leaving the
// remainder in %r2 and the quotient in %r3; the caller must
// guarantee h < d, otherwise the quotient overflows 64 bits and
// the CPU raises a fixed-point-divide exception.
.globl	bn_div_words
.type	bn_div_words,@function
.align	4
bn_div_words:
	dlgr	%r2,%r4
	lgr	%r2,%r3			// return the quotient
	br	%r14
.size	bn_div_words,.-bn_div_words
| 220 | |
// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
//
// rp[i] = ap[i] + bp[i] with carry propagation; returns the final
// carry bit (0 or 1).
// In:   %r2 = rp, %r3 = ap, %r4 = bp, %r5 = len (signed 32-bit)
// Out:  %r2 = carry
// Only %r6 needs saving. The carry lives purely in the CC bit across
// the alcg chain; stg/la/lghi/brct leave CC untouched.
.globl	bn_add_words
.type	bn_add_words,@function
.align	4
bn_add_words:
	la	%r1,0(%r2)		// put rp aside
	lghi	%r2,0			// i=0
	ltgfr	%r5,%r5
	bler	%r14			// if (len<=0) return 0;

	stg	%r6,48(%r15)
	lghi	%r6,3
	nr	%r6,%r5			// len%4
	sra	%r5,2			// len/4, use sra because it sets condition code
	jz	.Loop1_add		// carry is incidentally cleared if branch taken
	algr	%r2,%r2			// clear carry (%r2 is 0 here)

.Loop4_add:
	lg	%r0,0(%r2,%r3)
	alcg	%r0,0(%r2,%r4)
	stg	%r0,0(%r2,%r1)
	lg	%r0,8(%r2,%r3)
	alcg	%r0,8(%r2,%r4)
	stg	%r0,8(%r2,%r1)
	lg	%r0,16(%r2,%r3)
	alcg	%r0,16(%r2,%r4)
	stg	%r0,16(%r2,%r1)
	lg	%r0,24(%r2,%r3)
	alcg	%r0,24(%r2,%r4)
	stg	%r0,24(%r2,%r1)

	la	%r2,32(%r2)		// i+=4
	brct	%r5,.Loop4_add

	la	%r6,1(%r6)		// see if len%4 is zero ...
	brct	%r6,.Loop1_add		// without touching condition code:-)

.Lexit_add:
	lghi	%r2,0
	alcgr	%r2,%r2			// materialize the carry bit as 0/1
	lg	%r6,48(%r15)
	br	%r14

.Loop1_add:
	lg	%r0,0(%r2,%r3)
	alcg	%r0,0(%r2,%r4)
	stg	%r0,0(%r2,%r1)

	la	%r2,8(%r2)		// i++
	brct	%r6,.Loop1_add

	j	.Lexit_add
.size	bn_add_words,.-bn_add_words
| 274 | |
// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
//
// rp[i] = ap[i] - bp[i] with borrow propagation; returns the final
// borrow bit (0 or 1).
// In:   %r2 = rp, %r3 = ap, %r4 = bp, %r5 = len (signed 32-bit)
// Out:  %r2 = borrow
// Borrow is carried in CC by the slbg chain. Note the inverted
// dispatch vs bn_add_words: jnz goes straight to the 4x loop and the
// slgr "clear borrow" is only executed on the fall-through path.
.globl	bn_sub_words
.type	bn_sub_words,@function
.align	4
bn_sub_words:
	la	%r1,0(%r2)		// put rp aside
	lghi	%r2,0			// i=0
	ltgfr	%r5,%r5
	bler	%r14			// if (len<=0) return 0;

	stg	%r6,48(%r15)
	lghi	%r6,3
	nr	%r6,%r5			// len%4
	sra	%r5,2			// len/4, use sra because it sets condition code
	jnz	.Loop4_sub		// borrow is incidentally cleared if branch taken
	slgr	%r2,%r2			// clear borrow

.Loop1_sub:
	lg	%r0,0(%r2,%r3)
	slbg	%r0,0(%r2,%r4)
	stg	%r0,0(%r2,%r1)

	la	%r2,8(%r2)		// i++
	brct	%r6,.Loop1_sub
	j	.Lexit_sub

.Loop4_sub:
	lg	%r0,0(%r2,%r3)
	slbg	%r0,0(%r2,%r4)
	stg	%r0,0(%r2,%r1)
	lg	%r0,8(%r2,%r3)
	slbg	%r0,8(%r2,%r4)
	stg	%r0,8(%r2,%r1)
	lg	%r0,16(%r2,%r3)
	slbg	%r0,16(%r2,%r4)
	stg	%r0,16(%r2,%r1)
	lg	%r0,24(%r2,%r3)
	slbg	%r0,24(%r2,%r4)
	stg	%r0,24(%r2,%r1)

	la	%r2,32(%r2)		// i+=4
	brct	%r5,.Loop4_sub

	la	%r6,1(%r6)		// see if len%4 is zero ...
	brct	%r6,.Loop1_sub		// without touching condition code:-)

.Lexit_sub:
	lghi	%r2,0
	slbgr	%r2,%r2			// %r2 = 0 - 0 - borrow = 0 or -1
	lcgr	%r2,%r2			// negate: borrow bit as 0/1
	lg	%r6,48(%r15)
	br	%r14
.size	bn_sub_words,.-bn_sub_words
| 328 | |
// Comba accumulator columns: (c3,c2,c1) form a 192-bit running sum
// (c1 low). The three roles rotate from result word to result word.
#define c1	%r1
#define c2	%r5
#define c3	%r8

// mul_add_c(ai,bi,c1,c2,c3): (c3,c2,c1) += ap[ai]*bp[bi].
// mlg multiplies %r7 by the memory operand into the even-odd pair
// %r6:%r7 (high:low); zero (%r0) must contain 0 so alcgr c3,zero
// just absorbs the carry-out. Comments must stay out of the macro
// body: a // comment before the backslash would splice the next
// line into the comment.
#define mul_add_c(ai,bi,c1,c2,c3) \
	lg	%r7,ai*8(%r3); \
	mlg	%r6,bi*8(%r4); \
	algr	c1,%r7; \
	alcgr	c2,%r6; \
	alcgr	c3,zero
| 339 | |
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
//
// rp[0..15] = ap[0..7] * bp[0..7], fully unrolled Comba (column-wise)
// multiplication: result word k sums all products ap[i]*bp[j] with
// i+j == k into the rotating (c1,c2,c3) accumulator, then stores the
// low word and clears it for reuse as the new top word.
// In:   %r2 = rp, %r3 = ap, %r4 = bp.  No return value.
// Clobbers %r0,%r1,%r5,%r7 (and %r6,%r8 — saved/restored).
.globl	bn_mul_comba8
.type	bn_mul_comba8,@function
.align	4
bn_mul_comba8:
	stmg	%r6,%r8,48(%r15)

	lghi	c1,0
	lghi	c2,0
	lghi	c3,0
	lghi	zero,0

	// column 0: rp[0]
	mul_add_c(0,0,c1,c2,c3);
	stg	c1,0*8(%r2)
	lghi	c1,0

	// column 1: all ai+bi==1, accumulator roles rotated
	mul_add_c(0,1,c2,c3,c1);
	mul_add_c(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	mul_add_c(2,0,c3,c1,c2);
	mul_add_c(1,1,c3,c1,c2);
	mul_add_c(0,2,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	mul_add_c(0,3,c1,c2,c3);
	mul_add_c(1,2,c1,c2,c3);
	mul_add_c(2,1,c1,c2,c3);
	mul_add_c(3,0,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	mul_add_c(4,0,c2,c3,c1);
	mul_add_c(3,1,c2,c3,c1);
	mul_add_c(2,2,c2,c3,c1);
	mul_add_c(1,3,c2,c3,c1);
	mul_add_c(0,4,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	mul_add_c(0,5,c3,c1,c2);
	mul_add_c(1,4,c3,c1,c2);
	mul_add_c(2,3,c3,c1,c2);
	mul_add_c(3,2,c3,c1,c2);
	mul_add_c(4,1,c3,c1,c2);
	mul_add_c(5,0,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	mul_add_c(6,0,c1,c2,c3);
	mul_add_c(5,1,c1,c2,c3);
	mul_add_c(4,2,c1,c2,c3);
	mul_add_c(3,3,c1,c2,c3);
	mul_add_c(2,4,c1,c2,c3);
	mul_add_c(1,5,c1,c2,c3);
	mul_add_c(0,6,c1,c2,c3);
	stg	c1,6*8(%r2)
	lghi	c1,0

	mul_add_c(0,7,c2,c3,c1);
	mul_add_c(1,6,c2,c3,c1);
	mul_add_c(2,5,c2,c3,c1);
	mul_add_c(3,4,c2,c3,c1);
	mul_add_c(4,3,c2,c3,c1);
	mul_add_c(5,2,c2,c3,c1);
	mul_add_c(6,1,c2,c3,c1);
	mul_add_c(7,0,c2,c3,c1);
	stg	c2,7*8(%r2)
	lghi	c2,0

	mul_add_c(7,1,c3,c1,c2);
	mul_add_c(6,2,c3,c1,c2);
	mul_add_c(5,3,c3,c1,c2);
	mul_add_c(4,4,c3,c1,c2);
	mul_add_c(3,5,c3,c1,c2);
	mul_add_c(2,6,c3,c1,c2);
	mul_add_c(1,7,c3,c1,c2);
	stg	c3,8*8(%r2)
	lghi	c3,0

	mul_add_c(2,7,c1,c2,c3);
	mul_add_c(3,6,c1,c2,c3);
	mul_add_c(4,5,c1,c2,c3);
	mul_add_c(5,4,c1,c2,c3);
	mul_add_c(6,3,c1,c2,c3);
	mul_add_c(7,2,c1,c2,c3);
	stg	c1,9*8(%r2)
	lghi	c1,0

	mul_add_c(7,3,c2,c3,c1);
	mul_add_c(6,4,c2,c3,c1);
	mul_add_c(5,5,c2,c3,c1);
	mul_add_c(4,6,c2,c3,c1);
	mul_add_c(3,7,c2,c3,c1);
	stg	c2,10*8(%r2)
	lghi	c2,0

	mul_add_c(4,7,c3,c1,c2);
	mul_add_c(5,6,c3,c1,c2);
	mul_add_c(6,5,c3,c1,c2);
	mul_add_c(7,4,c3,c1,c2);
	stg	c3,11*8(%r2)
	lghi	c3,0

	mul_add_c(7,5,c1,c2,c3);
	mul_add_c(6,6,c1,c2,c3);
	mul_add_c(5,7,c1,c2,c3);
	stg	c1,12*8(%r2)
	lghi	c1,0


	mul_add_c(6,7,c2,c3,c1);
	mul_add_c(7,6,c2,c3,c1);
	stg	c2,13*8(%r2)
	lghi	c2,0

	// final column rp[14]; the leftover top word becomes rp[15]
	mul_add_c(7,7,c3,c1,c2);
	stg	c3,14*8(%r2)
	stg	c1,15*8(%r2)

	lmg	%r6,%r8,48(%r15)
	br	%r14
.size	bn_mul_comba8,.-bn_mul_comba8
| 465 | |
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
//
// rp[0..7] = ap[0..3] * bp[0..3], fully unrolled Comba multiplication
// (same column scheme as bn_mul_comba8 above).
// In:   %r2 = rp, %r3 = ap, %r4 = bp.  No return value.
// Clobbers %r0,%r1,%r5,%r7 (and %r6,%r8 — saved/restored).
//
// Two fixes vs the previous revision:
//  - rp[0] was stored through %r3 (ap), corrupting the multiplicand
//    and leaving rp[0] unwritten; it must go through %r2 (rp), as in
//    every sibling function here (cf. bn_mul_comba8).
//  - the epilogue used stmg (re-save) instead of lmg, returning with
//    callee-saved %r6-%r8 clobbered in violation of the s390x ABI.
.globl	bn_mul_comba4
.type	bn_mul_comba4,@function
.align	4
bn_mul_comba4:
	stmg	%r6,%r8,48(%r15)

	lghi	c1,0
	lghi	c2,0
	lghi	c3,0
	lghi	zero,0

	mul_add_c(0,0,c1,c2,c3);
	stg	c1,0*8(%r2)		// was 0*8(%r3): store into rp, not ap
	lghi	c1,0

	mul_add_c(0,1,c2,c3,c1);
	mul_add_c(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	mul_add_c(2,0,c3,c1,c2);
	mul_add_c(1,1,c3,c1,c2);
	mul_add_c(0,2,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	mul_add_c(0,3,c1,c2,c3);
	mul_add_c(1,2,c1,c2,c3);
	mul_add_c(2,1,c1,c2,c3);
	mul_add_c(3,0,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	mul_add_c(3,1,c2,c3,c1);
	mul_add_c(2,2,c2,c3,c1);
	mul_add_c(1,3,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	mul_add_c(2,3,c3,c1,c2);
	mul_add_c(3,2,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	mul_add_c(3,3,c1,c2,c3);
	stg	c1,6*8(%r2)
	stg	c2,7*8(%r2)

	lmg	%r6,%r8,48(%r15)	// was stmg: restore callee-saved regs
	br	%r14
.size	bn_mul_comba4,.-bn_mul_comba4
| 518 | |
// sqr_add_c(ai,c1,c2,c3): (c3,c2,c1) += ap[ai]^2.
// mlgr squares %r7 into the even-odd pair %r6:%r7.
#define sqr_add_c(ai,c1,c2,c3) \
	lg	%r7,ai*8(%r3); \
	mlgr	%r6,%r7; \
	algr	c1,%r7; \
	alcgr	c2,%r6; \
	alcgr	c3,zero

// sqr_add_c2(ai,aj,c1,c2,c3): (c3,c2,c1) += 2*ap[ai]*ap[aj].
// The doubling is done by accumulating the 128-bit product twice
// (rather than shifting it) so each carry-out is absorbed into c3
// via zero (%r0, must hold 0).
#define sqr_add_c2(ai,aj,c1,c2,c3) \
	lg	%r7,ai*8(%r3); \
	mlg	%r6,aj*8(%r3); \
	algr	c1,%r7; \
	alcgr	c2,%r6; \
	alcgr	c3,zero; \
	algr	c1,%r7; \
	alcgr	c2,%r6; \
	alcgr	c3,zero
| 535 | |
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
//
// rp[0..15] = ap[0..7]^2, unrolled Comba squaring. Column k sums
// ap[i]*ap[j] for i+j==k; off-diagonal pairs appear once as
// sqr_add_c2 (counted twice), diagonal terms once as sqr_add_c.
// In:   %r2 = rp, %r3 = ap.  No return value.
// Clobbers %r0,%r1,%r5,%r7 (and %r6,%r8 — saved/restored).
.globl	bn_sqr_comba8
.type	bn_sqr_comba8,@function
.align	4
bn_sqr_comba8:
	stmg	%r6,%r8,48(%r15)

	lghi	c1,0
	lghi	c2,0
	lghi	c3,0
	lghi	zero,0

	sqr_add_c(0,c1,c2,c3);
	stg	c1,0*8(%r2)
	lghi	c1,0

	sqr_add_c2(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	sqr_add_c(1,c3,c1,c2);
	sqr_add_c2(2,0,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	sqr_add_c2(3,0,c1,c2,c3);
	sqr_add_c2(2,1,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	sqr_add_c(2,c2,c3,c1);
	sqr_add_c2(3,1,c2,c3,c1);
	sqr_add_c2(4,0,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	sqr_add_c2(5,0,c3,c1,c2);
	sqr_add_c2(4,1,c3,c1,c2);
	sqr_add_c2(3,2,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	sqr_add_c(3,c1,c2,c3);
	sqr_add_c2(4,2,c1,c2,c3);
	sqr_add_c2(5,1,c1,c2,c3);
	sqr_add_c2(6,0,c1,c2,c3);
	stg	c1,6*8(%r2)
	lghi	c1,0

	sqr_add_c2(7,0,c2,c3,c1);
	sqr_add_c2(6,1,c2,c3,c1);
	sqr_add_c2(5,2,c2,c3,c1);
	sqr_add_c2(4,3,c2,c3,c1);
	stg	c2,7*8(%r2)
	lghi	c2,0

	sqr_add_c(4,c3,c1,c2);
	sqr_add_c2(5,3,c3,c1,c2);
	sqr_add_c2(6,2,c3,c1,c2);
	sqr_add_c2(7,1,c3,c1,c2);
	stg	c3,8*8(%r2)
	lghi	c3,0

	sqr_add_c2(7,2,c1,c2,c3);
	sqr_add_c2(6,3,c1,c2,c3);
	sqr_add_c2(5,4,c1,c2,c3);
	stg	c1,9*8(%r2)
	lghi	c1,0

	sqr_add_c(5,c2,c3,c1);
	sqr_add_c2(6,4,c2,c3,c1);
	sqr_add_c2(7,3,c2,c3,c1);
	stg	c2,10*8(%r2)
	lghi	c2,0

	sqr_add_c2(7,4,c3,c1,c2);
	sqr_add_c2(6,5,c3,c1,c2);
	stg	c3,11*8(%r2)
	lghi	c3,0

	sqr_add_c(6,c1,c2,c3);
	sqr_add_c2(7,5,c1,c2,c3);
	stg	c1,12*8(%r2)
	lghi	c1,0

	sqr_add_c2(7,6,c2,c3,c1);
	stg	c2,13*8(%r2)
	lghi	c2,0

	sqr_add_c(7,c3,c1,c2);
	stg	c3,14*8(%r2)
	stg	c1,15*8(%r2)

	lmg	%r6,%r8,48(%r15)
	br	%r14
.size	bn_sqr_comba8,.-bn_sqr_comba8
| 632 | |
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
//
// rp[0..7] = ap[0..3]^2, unrolled Comba squaring (same scheme as
// bn_sqr_comba8: sqr_add_c for diagonal terms, sqr_add_c2 for the
// doubled off-diagonal pairs).
// In:   %r2 = rp, %r3 = ap.  No return value.
// Clobbers %r0,%r1,%r5,%r7 (and %r6,%r8 — saved/restored).
.globl	bn_sqr_comba4
.type	bn_sqr_comba4,@function
.align	4
bn_sqr_comba4:
	stmg	%r6,%r8,48(%r15)

	lghi	c1,0
	lghi	c2,0
	lghi	c3,0
	lghi	zero,0

	sqr_add_c(0,c1,c2,c3);
	stg	c1,0*8(%r2)
	lghi	c1,0

	sqr_add_c2(1,0,c2,c3,c1);
	stg	c2,1*8(%r2)
	lghi	c2,0

	sqr_add_c(1,c3,c1,c2);
	sqr_add_c2(2,0,c3,c1,c2);
	stg	c3,2*8(%r2)
	lghi	c3,0

	sqr_add_c2(3,0,c1,c2,c3);
	sqr_add_c2(2,1,c1,c2,c3);
	stg	c1,3*8(%r2)
	lghi	c1,0

	sqr_add_c(2,c2,c3,c1);
	sqr_add_c2(3,1,c2,c3,c1);
	stg	c2,4*8(%r2)
	lghi	c2,0

	sqr_add_c2(3,2,c3,c1,c2);
	stg	c3,5*8(%r2)
	lghi	c3,0

	sqr_add_c(3,c1,c2,c3);
	stg	c1,6*8(%r2)
	stg	c2,7*8(%r2)

	lmg	%r6,%r8,48(%r15)
	br	%r14
.size	bn_sqr_comba4,.-bn_sqr_comba4