Skip to content

Commit 89bc80f

Browse files
committed
Fixed alpha blending for the MMX blit functions
The Remarks section of the SDL_BlitSurface documentation says that "when SDL_BLENDMODE_BLEND, we have dstA = srcA + (dstA * (1-srcA))". However, when I tested some pictures, the result actually implied "dstA=srcA". I stepped into the source code and found that, after I set SDL_BLENDMODE_BLEND on the source surface, the blit function that SDL_BlitSurface ends up using on my computer is BlitRGBtoRGBPixelAlphaMMX. There I found this code: else if (alpha == amask) { /* opaque alpha -- copy RGB, keep dst alpha */ *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); The same code is used in BlitRGBtoRGBPixelAlphaMMX3DNOW and BlitRGBtoRGBPixelAlpha, so I think they all still keep the dst alpha. Best regards, Jianyu Guan
1 parent 6572847 commit 89bc80f

File tree

1 file changed

+38
-36
lines changed

1 file changed

+38
-36
lines changed

src/video/SDL_blit_A.c

Lines changed: 38 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -337,25 +337,23 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
337337
Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
338338
Uint32 amask = sf->Amask;
339339
Uint32 ashift = sf->Ashift;
340-
Uint64 multmask;
340+
Uint64 multmask, multmask2;
341341

342-
__m64 src1, dst1, mm_alpha, mm_zero, dmask;
342+
__m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
343343

344344
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
345-
multmask = 0xFFFF;
346-
multmask <<= (ashift * 2);
347-
multmask = ~multmask;
348-
dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
345+
multmask = 0x00FF;
346+
multmask <<= (ashift * 2);
347+
multmask2 = 0x00FF00FF00FF00FF;
349348

350349
while (height--) {
351350
/* *INDENT-OFF* */
352351
DUFFS_LOOP4({
353352
Uint32 alpha = *srcp & amask;
354353
if (alpha == 0) {
355354
/* do nothing */
356-
} else if (alpha == amask) {
357-
/* opaque alpha -- copy RGB, keep dst alpha */
358-
*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
355+
} else if (alpha == amask || (*dstp & amask) == 0) {
356+
*dstp = *srcp;
359357
} else {
360358
src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
361359
src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
@@ -366,15 +364,17 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
366364
mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
367365
mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
368366
mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
369-
mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
370-
mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
367+
mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
368+
mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
369+
mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha*/
371370

372371
/* blend */
373-
src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
374-
src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
375-
src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
376-
dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
377-
dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
372+
src1 = _mm_mullo_pi16(src1, mm_alpha);
373+
src1 = _mm_srli_pi16(src1, 8);
374+
dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
375+
dst1 = _mm_srli_pi16(dst1, 8);
376+
dst1 = _mm_add_pi16(src1, dst1);
377+
dst1 = _mm_packs_pu16(dst1, mm_zero);
378378

379379
*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
380380
}
@@ -481,23 +481,24 @@ BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
481481
compositioning used (>>8 instead of /255) doesn't handle
482482
it correctly. Also special-case alpha=0 for speed?
483483
Benchmark this! */
484-
if(alpha) {
485-
if(alpha == SDL_ALPHA_OPAQUE) {
486-
*dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
484+
if (alpha) {
485+
if (alpha == SDL_ALPHA_OPAQUE) {
486+
*dstp = *srcp;
487487
} else {
488488
/*
489489
* take out the middle component (green), and process
490490
* the other two in parallel. One multiply less.
491491
*/
492492
d = *dstp;
493-
dalpha = d & 0xff000000;
493+
dalpha = d >> 24;
494494
s1 = s & 0xff00ff;
495495
d1 = d & 0xff00ff;
496496
d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
497497
s &= 0xff00;
498498
d &= 0xff00;
499499
d = (d + ((s - d) * alpha >> 8)) & 0xff00;
500-
*dstp = d1 | d | dalpha;
500+
dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
501+
*dstp = d1 | d | (dalpha << 24);
501502
}
502503
}
503504
++srcp;
@@ -524,15 +525,14 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
524525
Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
525526
Uint32 amask = sf->Amask;
526527
Uint32 ashift = sf->Ashift;
527-
Uint64 multmask;
528+
Uint64 multmask, multmask2;
528529

529-
__m64 src1, dst1, mm_alpha, mm_zero, dmask;
530+
__m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
530531

531532
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
532-
multmask = 0xFFFF;
533+
multmask = 0x00FF;
533534
multmask <<= (ashift * 2);
534-
multmask = ~multmask;
535-
dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
535+
multmask2 = 0x00FF00FF00FF00FF;
536536

537537
while (height--) {
538538
/* *INDENT-OFF* */
@@ -545,9 +545,8 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
545545
alpha = *srcp & amask;
546546
if (alpha == 0) {
547547
/* do nothing */
548-
} else if (alpha == amask) {
549-
/* copy RGB, keep dst alpha */
550-
*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
548+
} else if (alpha == amask || (*dstp & amask) == 0) {
549+
*dstp = *srcp;
551550
} else {
552551
src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
553552
src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
@@ -558,15 +557,18 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
558557
mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
559558
mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
560559
mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
561-
mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
562-
mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
560+
mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
561+
mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
562+
mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha*/
563+
563564

564565
/* blend */
565-
src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
566-
src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
567-
src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
568-
dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
569-
dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
566+
src1 = _mm_mullo_pi16(src1, mm_alpha);
567+
src1 = _mm_srli_pi16(src1, 8);
568+
dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
569+
dst1 = _mm_srli_pi16(dst1, 8);
570+
dst1 = _mm_add_pi16(src1, dst1);
571+
dst1 = _mm_packs_pu16(dst1, mm_zero);
570572

571573
*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
572574
}

0 commit comments

Comments
 (0)