21 #include "../SDL_internal.h"
47 const unsigned A = info->
a;
62 if ( palmap ==
NULL ) {
63 *
dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
65 *
dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
92 unsigned sR, sG, sB, sA;
108 if ( palmap ==
NULL ) {
109 *
dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
111 *
dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
141 const unsigned A = info->
a;
148 if ( Pixel != ckey ) {
157 if ( palmap ==
NULL ) {
158 *
dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
160 *
dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
187 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
189 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);
190 lmask = _mm_set_pi32(0x00010101, 0x00010101);
191 dsta = _mm_set_pi32(dalpha, dalpha);
198 *dstp++ = ((((
s & 0x00fefefe) + (
d & 0x00fefefe)) >> 1)
199 + (
s &
d & 0x00010101)) | dalpha;
203 for (
n >>= 1;
n > 0; --
n) {
204 dst1 = *(__m64 *) dstp;
207 src1 = *(__m64 *) srcp;
210 dst2 = _mm_and_si64(dst2, hmask);
211 src2 = _mm_and_si64(src2, hmask);
212 src2 = _mm_add_pi32(src2, dst2);
213 src2 = _mm_srli_pi32(src2, 1);
215 dst1 = _mm_and_si64(dst1, src1);
216 dst1 = _mm_and_si64(dst1, lmask);
217 dst1 = _mm_add_pi32(dst1, src2);
218 dst1 = _mm_or_si64(dst1, dsta);
220 *(__m64 *) dstp = dst1;
241 BlitRGBtoRGBSurfaceAlpha128MMX(info);
252 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
254 mm_zero = _mm_setzero_si64();
257 amult = amult | (amult << 16);
259 (0xff << df->
Rshift) | (0xff << df->
260 Gshift) | (0xff << df->
Bshift);
261 mm_alpha = _mm_set_pi32(0, amult & chanmask);
262 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero);
264 dsta = _mm_set_pi32(dalpha, dalpha);
270 src2 = _mm_cvtsi32_si64(*srcp);
271 src2 = _mm_unpacklo_pi8(src2, mm_zero);
273 dst1 = _mm_cvtsi32_si64(*dstp);
274 dst1 = _mm_unpacklo_pi8(dst1, mm_zero);
276 src2 = _mm_sub_pi16(src2, dst1);
277 src2 = _mm_mullo_pi16(src2, mm_alpha);
278 src2 = _mm_srli_pi16(src2, 8);
279 dst1 = _mm_add_pi8(src2, dst1);
281 dst1 = _mm_packs_pu16(dst1, mm_zero);
282 dst1 = _mm_or_si64(dst1, dsta);
283 *dstp = _mm_cvtsi64_si32(dst1);
291 for (
n >>= 1;
n > 0; --
n) {
293 src1 = *(__m64 *) srcp;
295 src1 = _mm_unpacklo_pi8(src1, mm_zero);
296 src2 = _mm_unpackhi_pi8(src2, mm_zero);
298 dst1 = *(__m64 *) dstp;
300 dst1 = _mm_unpacklo_pi8(dst1, mm_zero);
301 dst2 = _mm_unpackhi_pi8(dst2, mm_zero);
303 src1 = _mm_sub_pi16(src1, dst1);
304 src1 = _mm_mullo_pi16(src1, mm_alpha);
305 src1 = _mm_srli_pi16(src1, 8);
306 dst1 = _mm_add_pi8(src1, dst1);
308 src2 = _mm_sub_pi16(src2, dst2);
309 src2 = _mm_mullo_pi16(src2, mm_alpha);
310 src2 = _mm_srli_pi16(src2, 8);
311 dst2 = _mm_add_pi8(src2, dst2);
313 dst1 = _mm_packs_pu16(dst1, dst2);
314 dst1 = _mm_or_si64(dst1, dsta);
316 *(__m64 *) dstp = dst1;
341 Uint64 multmask, multmask2;
343 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
345 mm_zero = _mm_setzero_si64();
347 multmask <<= (ashift * 2);
348 multmask2 = 0x00FF00FF00FF00FFULL;
356 }
else if (
alpha == amask) {
359 src1 = _mm_cvtsi32_si64(*srcp);
360 src1 = _mm_unpacklo_pi8(src1, mm_zero);
362 dst1 = _mm_cvtsi32_si64(*dstp);
363 dst1 = _mm_unpacklo_pi8(dst1, mm_zero);
365 mm_alpha = _mm_cvtsi32_si64(
alpha);
366 mm_alpha = _mm_srli_si64(mm_alpha, ashift);
367 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
368 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha);
369 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);
370 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);
373 src1 = _mm_mullo_pi16(src1, mm_alpha);
374 src1 = _mm_srli_pi16(src1, 8);
375 dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
376 dst1 = _mm_srli_pi16(dst1, 8);
377 dst1 = _mm_add_pi16(src1, dst1);
378 dst1 = _mm_packs_pu16(dst1, mm_zero);
380 *dstp = _mm_cvtsi64_si32(dst1);
394 #if SDL_ARM_SIMD_BLITTERS
407 BlitARGBto565PixelAlphaARMSIMDAsm(
width,
height, dstp, dststride, srcp, srcstride);
422 BlitRGBtoRGBPixelAlphaARMSIMDAsm(
width,
height, dstp, dststride, srcp, srcstride);
426 #if SDL_ARM_NEON_BLITTERS
439 BlitARGBto565PixelAlphaARMNEONAsm(
width,
height, dstp, dststride, srcp, srcstride);
454 BlitRGBtoRGBPixelAlphaARMNEONAsm(
width,
height, dstp, dststride, srcp, srcstride);
474 *dstp++ = ((((
s & 0x00fefefe) + (
d & 0x00fefefe)) >> 1)
475 + (
s &
d & 0x00010101)) | 0xff000000;
509 d1 = (d1 + ((
s1 - d1) *
alpha >> 8))
513 d = (
d + ((
s -
d) *
alpha >> 8)) & 0xff00;
514 *dstp = d1 |
d | 0xff000000;
561 d1 = (d1 + ((
s1 - d1) *
alpha >> 8)) & 0xff00ff;
564 d = (
d + ((
s -
d) *
alpha >> 8)) & 0xff00;
565 dalpha =
alpha + (dalpha * (
alpha ^ 0xFF) >> 8);
566 *dstp = d1 |
d | (dalpha << 24);
592 Uint64 multmask, multmask2;
594 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
596 mm_zero = _mm_setzero_si64();
598 multmask <<= (ashift * 2);
599 multmask2 = 0x00FF00FF00FF00FFULL;
606 _m_prefetch(srcp + 16);
607 _m_prefetch(dstp + 16);
609 alpha = *srcp & amask;
612 }
else if (
alpha == amask) {
615 src1 = _mm_cvtsi32_si64(*srcp);
616 src1 = _mm_unpacklo_pi8(src1, mm_zero);
618 dst1 = _mm_cvtsi32_si64(*dstp);
619 dst1 = _mm_unpacklo_pi8(dst1, mm_zero);
621 mm_alpha = _mm_cvtsi32_si64(
alpha);
622 mm_alpha = _mm_srli_si64(mm_alpha, ashift);
623 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
624 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha);
625 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);
626 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);
630 src1 = _mm_mullo_pi16(src1, mm_alpha);
631 src1 = _mm_srli_pi16(src1, 8);
632 dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
633 dst1 = _mm_srli_pi16(dst1, 8);
634 dst1 = _mm_add_pi16(src1, dst1);
635 dst1 = _mm_packs_pu16(dst1, mm_zero);
637 *dstp = _mm_cvtsi64_si32(dst1);
/*
 * 50/50 blend of two 16-bit pixels (565 or 555 layout).
 *
 * `mask` must select every channel bit EXCEPT the lowest bit of each
 * channel (0xf7de for 565, 0xfbde for 555).  Averaging works by masking
 * off the per-channel low bits, adding, and shifting right once; the
 * final `s & d & ~mask` term adds back the carry that is lost when the
 * low bit of a channel is set in BOTH pixels.
 *
 * Fix: every macro argument is now parenthesized in the expansion, so
 * callers may pass compound expressions (e.g. `a | b`) without silent
 * precedence bugs around `&` and `>>`.
 */
#define BLEND16_50(d, s, mask)                                          \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) +                         \
     ((s) & (d) & (~(mask) & 0xffff)))
/*
 * 50/50 blend of TWO 16-bit pixels packed into one 32-bit word.
 *
 * Same channel-averaging trick as BLEND16_50, with `mask` widened to
 * cover both packed pixels (`mask | mask << 16`).  Each operand is
 * masked and shifted down separately (rather than added first) so the
 * shift cannot carry between the two packed pixels.
 *
 * Fix: macro arguments are parenthesized in the expansion to avoid
 * precedence surprises with compound caller expressions; note `mask`
 * should be passed as an unsigned value so `mask << 16` cannot invoke
 * signed-shift overflow.
 */
#define BLEND2x16_50(d, s, mask)                                        \
    ((((s) & ((mask) | (mask) << 16)) >> 1) +                           \
     (((d) & ((mask) | (mask) << 16)) >> 1) +                           \
     ((s) & (d) & (~((mask) | (mask) << 16))))
693 prev_sw = ((
Uint32 *) srcp)[-1];
699 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
700 s = (prev_sw << 16) + (sw >> 16);
702 s = (prev_sw >> 16) + (sw << 16);
714 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
779 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
781 alpha &= ~(1 + 2 + 4);
782 mm_alpha = _mm_set_pi32(0,
alpha);
785 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
786 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);
789 mm_alpha = _mm_slli_si64(mm_alpha, 3);
792 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);
793 bmask = _mm_set_pi32(0x001F001F, 0x001F001F);
806 s = (
s |
s << 16) & 0x07e0f81f;
807 d = (
d |
d << 16) & 0x07e0f81f;
819 s = (
s |
s << 16) & 0x07e0f81f;
820 d = (
d |
d << 16) & 0x07e0f81f;
831 s = (
s |
s << 16) & 0x07e0f81f;
832 d = (
d |
d << 16) & 0x07e0f81f;
837 src1 = *(__m64*)srcp;
838 dst1 = *(__m64*)dstp;
842 src2 = _mm_srli_pi16(src2, 11);
845 dst2 = _mm_srli_pi16(dst2, 11);
848 src2 = _mm_sub_pi16(src2, dst2);
849 src2 = _mm_mullo_pi16(src2, mm_alpha);
850 src2 = _mm_srli_pi16(src2, 11);
851 dst2 = _mm_add_pi16(src2, dst2);
852 dst2 = _mm_slli_pi16(dst2, 11);
858 src2 = _mm_and_si64(src2, gmask);
861 dst2 = _mm_and_si64(dst2, gmask);
864 src2 = _mm_sub_pi16(src2, dst2);
865 src2 = _mm_mulhi_pi16(src2, mm_alpha);
866 src2 = _mm_slli_pi16(src2, 5);
867 dst2 = _mm_add_pi16(src2, dst2);
869 mm_res = _mm_or_si64(mm_res, dst2);
873 src2 = _mm_and_si64(src2, bmask);
876 dst2 = _mm_and_si64(dst2, bmask);
879 src2 = _mm_sub_pi16(src2, dst2);
880 src2 = _mm_mullo_pi16(src2, mm_alpha);
881 src2 = _mm_srli_pi16(src2, 11);
882 dst2 = _mm_add_pi16(src2, dst2);
883 dst2 = _mm_and_si64(dst2, bmask);
885 mm_res = _mm_or_si64(mm_res, dst2);
887 *(__m64*)dstp = mm_res;
916 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
918 alpha &= ~(1 + 2 + 4);
919 mm_alpha = _mm_set_pi32(0,
alpha);
922 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
923 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);
926 mm_alpha = _mm_slli_si64(mm_alpha, 3);
929 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);
930 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);
931 bmask = _mm_set_pi32(0x001F001F, 0x001F001F);
944 s = (
s |
s << 16) & 0x03e07c1f;
945 d = (
d |
d << 16) & 0x03e07c1f;
957 s = (
s |
s << 16) & 0x03e07c1f;
958 d = (
d |
d << 16) & 0x03e07c1f;
969 s = (
s |
s << 16) & 0x03e07c1f;
970 d = (
d |
d << 16) & 0x03e07c1f;
975 src1 = *(__m64*)srcp;
976 dst1 = *(__m64*)dstp;
980 src2 = _mm_and_si64(src2, rmask);
983 dst2 = _mm_and_si64(dst2, rmask);
986 src2 = _mm_sub_pi16(src2, dst2);
987 src2 = _mm_mulhi_pi16(src2, mm_alpha);
988 src2 = _mm_slli_pi16(src2, 5);
989 dst2 = _mm_add_pi16(src2, dst2);
990 dst2 = _mm_and_si64(dst2, rmask);
996 src2 = _mm_and_si64(src2, gmask);
999 dst2 = _mm_and_si64(dst2, gmask);
1002 src2 = _mm_sub_pi16(src2, dst2);
1003 src2 = _mm_mulhi_pi16(src2, mm_alpha);
1004 src2 = _mm_slli_pi16(src2, 5);
1005 dst2 = _mm_add_pi16(src2, dst2);
1007 mm_res = _mm_or_si64(mm_res, dst2);
1011 src2 = _mm_and_si64(src2, bmask);
1014 dst2 = _mm_and_si64(dst2, bmask);
1017 src2 = _mm_sub_pi16(src2, dst2);
1018 src2 = _mm_mullo_pi16(src2, mm_alpha);
1019 src2 = _mm_srli_pi16(src2, 11);
1020 dst2 = _mm_add_pi16(src2, dst2);
1021 dst2 = _mm_and_si64(dst2, bmask);
1023 mm_res = _mm_or_si64(mm_res, dst2);
1025 *(__m64*)dstp = mm_res;
1044 unsigned alpha = info->
a;
1066 s = (
s |
s << 16) & 0x07e0f81f;
1067 d = (
d |
d << 16) & 0x07e0f81f;
1083 unsigned alpha = info->
a;
1105 s = (
s |
s << 16) & 0x03e07c1f;
1106 d = (
d |
d << 16) & 0x03e07c1f;
1133 unsigned alpha =
s >> 27;
1140 *dstp = (
Uint16)((
s >> 8 & 0xf800) + (
s >> 5 & 0x7e0) + (
s >> 3 & 0x1f));
1147 s = ((
s & 0xfc00) << 11) + (
s >> 8 & 0xf800)
1149 d = (
d |
d << 16) & 0x07e0f81f;
1187 *dstp = (
Uint16)((
s >> 9 & 0x7c00) + (
s >> 6 & 0x3e0) + (
s >> 3 & 0x1f));
1194 s = ((
s & 0xf800) << 10) + (
s >> 9 & 0x7c00)
1196 d = (
d |
d << 16) & 0x03e07c1f;
1226 unsigned sR, sG, sB;
1227 unsigned dR, dG, dB, dA;
1228 const unsigned sA = info->
a;
1266 unsigned sR, sG, sB;
1267 unsigned dR, dG, dB, dA;
1268 const unsigned sA = info->
a;
1275 if(sA && Pixel != ckey) {
1306 unsigned sR, sG, sB, sA;
1307 unsigned dR, dG, dB, dA;
1353 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
1357 || (sf->
Bmask == 0xff && df->
Bmask == 0x1f)))
1359 #if SDL_ARM_NEON_BLITTERS
1361 return BlitARGBto565PixelAlphaARMNEON;
1363 #if SDL_ARM_SIMD_BLITTERS
1365 return BlitARGBto565PixelAlphaARMSIMD;
1370 && sf->
Gmask == 0xff00
1372 || (sf->
Bmask == 0xff && df->
Bmask == 0x1f))) {
1373 if (df->
Gmask == 0x7e0)
1375 else if (df->
Gmask == 0x3e0)
1384 #if defined(__MMX__) || defined(__3dNOW__)
1391 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1395 return BlitRGBtoRGBPixelAlphaMMX;
1399 if (sf->
Amask == 0xff000000) {
1400 #if SDL_ARM_NEON_BLITTERS
1402 return BlitRGBtoRGBPixelAlphaARMNEON;
1404 #if SDL_ARM_SIMD_BLITTERS
1406 return BlitRGBtoRGBPixelAlphaARMSIMD;
1420 if (sf->
Amask == 0) {
1433 if (df->
Gmask == 0x7e0) {
1436 return Blit565to565SurfaceAlphaMMX;
1440 }
else if (df->
Gmask == 0x3e0) {
1443 return Blit555to555SurfaceAlphaMMX;
1459 return BlitRGBtoRGBSurfaceAlphaMMX;
1475 if (sf->
Amask == 0) {