@@ -337,25 +337,23 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
337
337
Uint32 chanmask = sf -> Rmask | sf -> Gmask | sf -> Bmask ;
338
338
Uint32 amask = sf -> Amask ;
339
339
Uint32 ashift = sf -> Ashift ;
340
- Uint64 multmask ;
340
+ Uint64 multmask , multmask2 ;
341
341
342
- __m64 src1 , dst1 , mm_alpha , mm_zero , dmask ;
342
+ __m64 src1 , dst1 , mm_alpha , mm_zero , mm_alpha2 ;
343
343
344
344
mm_zero = _mm_setzero_si64 (); /* 0 -> mm_zero */
345
- multmask = 0xFFFF ;
346
- multmask <<= (ashift * 2 );
347
- multmask = ~multmask ;
348
- dmask = * (__m64 * ) & multmask ; /* dst alpha mask -> dmask */
345
+ multmask = 0x00FF ;
346
+ multmask <<= (ashift * 2 );
347
+ multmask2 = 0x00FF00FF00FF00FF ;
349
348
350
349
while (height -- ) {
351
350
/* *INDENT-OFF* */
352
351
DUFFS_LOOP4 ({
353
352
Uint32 alpha = * srcp & amask ;
354
353
if (alpha == 0 ) {
355
354
/* do nothing */
356
- } else if (alpha == amask ) {
357
- /* opaque alpha -- copy RGB, keep dst alpha */
358
- * dstp = (* srcp & chanmask ) | (* dstp & ~chanmask );
355
+ } else if (alpha == amask || (* dstp & amask ) == 0 ) {
356
+ * dstp = * srcp ;
359
357
} else {
360
358
src1 = _mm_cvtsi32_si64 (* srcp ); /* src(ARGB) -> src1 (0000ARGB)*/
361
359
src1 = _mm_unpacklo_pi8 (src1 , mm_zero ); /* 0A0R0G0B -> src1 */
@@ -366,15 +364,17 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
366
364
mm_alpha = _mm_cvtsi32_si64 (alpha ); /* alpha -> mm_alpha (0000000A) */
367
365
mm_alpha = _mm_srli_si64 (mm_alpha , ashift ); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
368
366
mm_alpha = _mm_unpacklo_pi16 (mm_alpha , mm_alpha ); /* 00000A0A -> mm_alpha */
369
- mm_alpha = _mm_unpacklo_pi32 (mm_alpha , mm_alpha ); /* 0A0A0A0A -> mm_alpha */
370
- mm_alpha = _mm_and_si64 (mm_alpha , dmask ); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
367
+ mm_alpha2 = _mm_unpacklo_pi32 (mm_alpha , mm_alpha ); /* 0A0A0A0A -> mm_alpha2 */
368
+ mm_alpha = _mm_or_si64 (mm_alpha2 , * (__m64 * ) & multmask ); /* 0F0A0A0A -> mm_alpha*/
369
+ mm_alpha2 = _mm_xor_si64 (mm_alpha2 , * (__m64 * ) & multmask2 ); /* 255 - alpha -> mm_alpha2 */
371
370
372
371
/* blend */
373
- src1 = _mm_sub_pi16 (src1 , dst1 );/* src1 - dst1 -> src1 */
374
- src1 = _mm_mullo_pi16 (src1 , mm_alpha ); /* (src1 - dst1) * alpha -> src1 */
375
- src1 = _mm_srli_pi16 (src1 , 8 ); /* src1 >> 8 -> src1(000R0G0B) */
376
- dst1 = _mm_add_pi8 (src1 , dst1 ); /* src1 + dst1 -> dst1(0A0R0G0B) */
377
- dst1 = _mm_packs_pu16 (dst1 , mm_zero ); /* 0000ARGB -> dst1 */
372
+ src1 = _mm_mullo_pi16 (src1 , mm_alpha );
373
+ src1 = _mm_srli_pi16 (src1 , 8 );
374
+ dst1 = _mm_mullo_pi16 (dst1 , mm_alpha2 );
375
+ dst1 = _mm_srli_pi16 (dst1 , 8 );
376
+ dst1 = _mm_add_pi16 (src1 , dst1 );
377
+ dst1 = _mm_packs_pu16 (dst1 , mm_zero );
378
378
379
379
* dstp = _mm_cvtsi64_si32 (dst1 ); /* dst1 -> pixel */
380
380
}
@@ -481,23 +481,24 @@ BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
481
481
compositioning used (>>8 instead of /255) doesn't handle
482
482
it correctly. Also special-case alpha=0 for speed?
483
483
Benchmark this! */
484
- if (alpha ) {
485
- if (alpha == SDL_ALPHA_OPAQUE ) {
486
- * dstp = ( s & 0x00ffffff ) | ( * dstp & 0xff000000 ) ;
484
+ if (alpha ) {
485
+ if (alpha == SDL_ALPHA_OPAQUE ) {
486
+ * dstp = * srcp ;
487
487
} else {
488
488
/*
489
489
* take out the middle component (green), and process
490
490
* the other two in parallel. One multiply less.
491
491
*/
492
492
d = * dstp ;
493
- dalpha = d & 0xff000000 ;
493
+ dalpha = d >> 24 ;
494
494
s1 = s & 0xff00ff ;
495
495
d1 = d & 0xff00ff ;
496
496
d1 = (d1 + ((s1 - d1 ) * alpha >> 8 )) & 0xff00ff ;
497
497
s &= 0xff00 ;
498
498
d &= 0xff00 ;
499
499
d = (d + ((s - d ) * alpha >> 8 )) & 0xff00 ;
500
- * dstp = d1 | d | dalpha ;
500
+ dalpha = alpha + (dalpha * (alpha ^ 0xFF ) >> 8 );
501
+ * dstp = d1 | d | (dalpha << 24 );
501
502
}
502
503
}
503
504
++ srcp ;
@@ -524,15 +525,14 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
524
525
Uint32 chanmask = sf -> Rmask | sf -> Gmask | sf -> Bmask ;
525
526
Uint32 amask = sf -> Amask ;
526
527
Uint32 ashift = sf -> Ashift ;
527
- Uint64 multmask ;
528
+ Uint64 multmask , multmask2 ;
528
529
529
- __m64 src1 , dst1 , mm_alpha , mm_zero , dmask ;
530
+ __m64 src1 , dst1 , mm_alpha , mm_zero , mm_alpha2 ;
530
531
531
532
mm_zero = _mm_setzero_si64 (); /* 0 -> mm_zero */
532
- multmask = 0xFFFF ;
533
+ multmask = 0x00FF ;
533
534
multmask <<= (ashift * 2 );
534
- multmask = ~multmask ;
535
- dmask = * (__m64 * ) & multmask ; /* dst alpha mask -> dmask */
535
+ multmask2 = 0x00FF00FF00FF00FF ;
536
536
537
537
while (height -- ) {
538
538
/* *INDENT-OFF* */
@@ -545,9 +545,8 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
545
545
alpha = * srcp & amask ;
546
546
if (alpha == 0 ) {
547
547
/* do nothing */
548
- } else if (alpha == amask ) {
549
- /* copy RGB, keep dst alpha */
550
- * dstp = (* srcp & chanmask ) | (* dstp & ~chanmask );
548
+ } else if (alpha == amask || (* dstp & amask ) == 0 ) {
549
+ * dstp = * srcp ;
551
550
} else {
552
551
src1 = _mm_cvtsi32_si64 (* srcp ); /* src(ARGB) -> src1 (0000ARGB)*/
553
552
src1 = _mm_unpacklo_pi8 (src1 , mm_zero ); /* 0A0R0G0B -> src1 */
@@ -558,15 +557,18 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
558
557
mm_alpha = _mm_cvtsi32_si64 (alpha ); /* alpha -> mm_alpha (0000000A) */
559
558
mm_alpha = _mm_srli_si64 (mm_alpha , ashift ); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
560
559
mm_alpha = _mm_unpacklo_pi16 (mm_alpha , mm_alpha ); /* 00000A0A -> mm_alpha */
561
- mm_alpha = _mm_unpacklo_pi32 (mm_alpha , mm_alpha ); /* 0A0A0A0A -> mm_alpha */
562
- mm_alpha = _mm_and_si64 (mm_alpha , dmask ); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
560
+ mm_alpha2 = _mm_unpacklo_pi32 (mm_alpha , mm_alpha ); /* 0A0A0A0A -> mm_alpha2 */
561
+ mm_alpha = _mm_or_si64 (mm_alpha2 , * (__m64 * ) & multmask ); /* 0F0A0A0A -> mm_alpha*/
562
+ mm_alpha2 = _mm_xor_si64 (mm_alpha2 , * (__m64 * ) & multmask2 ); /* 255 - alpha -> mm_alpha2 */
563
+
563
564
564
565
/* blend */
565
- src1 = _mm_sub_pi16 (src1 , dst1 );/* src - dst -> src1 */
566
- src1 = _mm_mullo_pi16 (src1 , mm_alpha ); /* (src - dst) * alpha -> src1 */
567
- src1 = _mm_srli_pi16 (src1 , 8 ); /* src1 >> 8 -> src1(000R0G0B) */
568
- dst1 = _mm_add_pi8 (src1 , dst1 ); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
569
- dst1 = _mm_packs_pu16 (dst1 , mm_zero ); /* 0000ARGB -> dst1 */
566
+ src1 = _mm_mullo_pi16 (src1 , mm_alpha );
567
+ src1 = _mm_srli_pi16 (src1 , 8 );
568
+ dst1 = _mm_mullo_pi16 (dst1 , mm_alpha2 );
569
+ dst1 = _mm_srli_pi16 (dst1 , 8 );
570
+ dst1 = _mm_add_pi16 (src1 , dst1 );
571
+ dst1 = _mm_packs_pu16 (dst1 , mm_zero );
570
572
571
573
* dstp = _mm_cvtsi64_si32 (dst1 ); /* dst1 -> pixel */
572
574
}
0 commit comments