SDL  2.0
SDL_blit_A.c
Go to the documentation of this file.
1 /*
2  Simple DirectMedia Layer
3  Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org>
4 
5  This software is provided 'as-is', without any express or implied
6  warranty. In no event will the authors be held liable for any damages
7  arising from the use of this software.
8 
9  Permission is granted to anyone to use this software for any purpose,
10  including commercial applications, and to alter it and redistribute it
11  freely, subject to the following restrictions:
12 
13  1. The origin of this software must not be misrepresented; you must not
14  claim that you wrote the original software. If you use this software
15  in a product, an acknowledgment in the product documentation would be
16  appreciated but is not required.
17  2. Altered source versions must be plainly marked as such, and must not be
18  misrepresented as being the original software.
19  3. This notice may not be removed or altered from any source distribution.
20 */
21 #include "../SDL_internal.h"
22 
23 #if SDL_HAVE_BLIT_A
24 
25 #include "SDL_video.h"
26 #include "SDL_blit.h"
27 
28 /* Functions to perform alpha blended blitting */
29 
30 /* N->1 blending with per-surface alpha */
31 static void
33 {
34  int width = info->dst_w;
35  int height = info->dst_h;
36  Uint8 *src = info->src;
37  int srcskip = info->src_skip;
38  Uint8 *dst = info->dst;
39  int dstskip = info->dst_skip;
40  Uint8 *palmap = info->table;
41  SDL_PixelFormat *srcfmt = info->src_fmt;
42  SDL_PixelFormat *dstfmt = info->dst_fmt;
43  int srcbpp = srcfmt->BytesPerPixel;
44  Uint32 Pixel;
45  unsigned sR, sG, sB;
46  unsigned dR, dG, dB;
47  const unsigned A = info->a;
48 
49  while (height--) {
50  /* *INDENT-OFF* */
52  {
53  DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
54  dR = dstfmt->palette->colors[*dst].r;
55  dG = dstfmt->palette->colors[*dst].g;
56  dB = dstfmt->palette->colors[*dst].b;
57  ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
58  dR &= 0xff;
59  dG &= 0xff;
60  dB &= 0xff;
61  /* Pack RGB into 8bit pixel */
62  if ( palmap == NULL ) {
63  *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
64  } else {
65  *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
66  }
67  dst++;
68  src += srcbpp;
69  },
70  width);
71  /* *INDENT-ON* */
72  src += srcskip;
73  dst += dstskip;
74  }
75 }
76 
77 /* N->1 blending with pixel alpha */
78 static void
80 {
81  int width = info->dst_w;
82  int height = info->dst_h;
83  Uint8 *src = info->src;
84  int srcskip = info->src_skip;
85  Uint8 *dst = info->dst;
86  int dstskip = info->dst_skip;
87  Uint8 *palmap = info->table;
88  SDL_PixelFormat *srcfmt = info->src_fmt;
89  SDL_PixelFormat *dstfmt = info->dst_fmt;
90  int srcbpp = srcfmt->BytesPerPixel;
91  Uint32 Pixel;
92  unsigned sR, sG, sB, sA;
93  unsigned dR, dG, dB;
94 
95  while (height--) {
96  /* *INDENT-OFF* */
98  {
99  DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
100  dR = dstfmt->palette->colors[*dst].r;
101  dG = dstfmt->palette->colors[*dst].g;
102  dB = dstfmt->palette->colors[*dst].b;
103  ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
104  dR &= 0xff;
105  dG &= 0xff;
106  dB &= 0xff;
107  /* Pack RGB into 8bit pixel */
108  if ( palmap == NULL ) {
109  *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
110  } else {
111  *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
112  }
113  dst++;
114  src += srcbpp;
115  },
116  width);
117  /* *INDENT-ON* */
118  src += srcskip;
119  dst += dstskip;
120  }
121 }
122 
123 /* colorkeyed N->1 blending with per-surface alpha */
124 static void
126 {
127  int width = info->dst_w;
128  int height = info->dst_h;
129  Uint8 *src = info->src;
130  int srcskip = info->src_skip;
131  Uint8 *dst = info->dst;
132  int dstskip = info->dst_skip;
133  Uint8 *palmap = info->table;
134  SDL_PixelFormat *srcfmt = info->src_fmt;
135  SDL_PixelFormat *dstfmt = info->dst_fmt;
136  int srcbpp = srcfmt->BytesPerPixel;
137  Uint32 ckey = info->colorkey;
138  Uint32 Pixel;
139  unsigned sR, sG, sB;
140  unsigned dR, dG, dB;
141  const unsigned A = info->a;
142 
143  while (height--) {
144  /* *INDENT-OFF* */
145  DUFFS_LOOP(
146  {
147  DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
148  if ( Pixel != ckey ) {
149  dR = dstfmt->palette->colors[*dst].r;
150  dG = dstfmt->palette->colors[*dst].g;
151  dB = dstfmt->palette->colors[*dst].b;
152  ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
153  dR &= 0xff;
154  dG &= 0xff;
155  dB &= 0xff;
156  /* Pack RGB into 8bit pixel */
157  if ( palmap == NULL ) {
158  *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
159  } else {
160  *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
161  }
162  }
163  dst++;
164  src += srcbpp;
165  },
166  width);
167  /* *INDENT-ON* */
168  src += srcskip;
169  dst += dstskip;
170  }
171 }
172 
173 #ifdef __MMX__
174 
175 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
176 static void
177 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
178 {
179  int width = info->dst_w;
180  int height = info->dst_h;
181  Uint32 *srcp = (Uint32 *) info->src;
182  int srcskip = info->src_skip >> 2;
183  Uint32 *dstp = (Uint32 *) info->dst;
184  int dstskip = info->dst_skip >> 2;
185  Uint32 dalpha = info->dst_fmt->Amask;
186 
187  __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
188 
189  hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
190  lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
191  dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
192 
193  while (height--) {
194  int n = width;
195  if (n & 1) {
196  Uint32 s = *srcp++;
197  Uint32 d = *dstp;
198  *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
199  + (s & d & 0x00010101)) | dalpha;
200  n--;
201  }
202 
203  for (n >>= 1; n > 0; --n) {
204  dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
205  dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
206 
207  src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
208  src2 = src1; /* 2 x src -> src2(ARGBARGB) */
209 
210  dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
211  src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
212  src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
213  src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
214 
215  dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
216  dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
217  dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
218  dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
219 
220  *(__m64 *) dstp = dst1; /* dst1 -> 2 x dst pixels */
221  dstp += 2;
222  srcp += 2;
223  }
224 
225  srcp += srcskip;
226  dstp += dstskip;
227  }
228  _mm_empty();
229 }
230 
231 /* fast RGB888->(A)RGB888 blending with surface alpha */
232 static void
233 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
234 {
235  SDL_PixelFormat *df = info->dst_fmt;
236  Uint32 chanmask;
237  unsigned alpha = info->a;
238 
239  if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
240  /* only call a128 version when R,G,B occupy lower bits */
241  BlitRGBtoRGBSurfaceAlpha128MMX(info);
242  } else {
243  int width = info->dst_w;
244  int height = info->dst_h;
245  Uint32 *srcp = (Uint32 *) info->src;
246  int srcskip = info->src_skip >> 2;
247  Uint32 *dstp = (Uint32 *) info->dst;
248  int dstskip = info->dst_skip >> 2;
249  Uint32 dalpha = df->Amask;
250  Uint32 amult;
251 
252  __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
253 
254  mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
255  /* form the alpha mult */
256  amult = alpha | (alpha << 8);
257  amult = amult | (amult << 16);
258  chanmask =
259  (0xff << df->Rshift) | (0xff << df->
260  Gshift) | (0xff << df->Bshift);
261  mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
262  mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
263  /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
264  dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
265 
266  while (height--) {
267  int n = width;
268  if (n & 1) {
269  /* One Pixel Blend */
270  src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
271  src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
272 
273  dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
274  dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
275 
276  src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
277  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
278  src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
279  dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
280 
281  dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
282  dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
283  *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
284 
285  ++srcp;
286  ++dstp;
287 
288  n--;
289  }
290 
291  for (n >>= 1; n > 0; --n) {
292  /* Two Pixels Blend */
293  src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
294  src2 = src1; /* 2 x src -> src2(ARGBARGB) */
295  src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
296  src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
297 
298  dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
299  dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
300  dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
301  dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
302 
303  src1 = _mm_sub_pi16(src1, dst1); /* src1 - dst1 -> src1 */
304  src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
305  src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
306  dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
307 
308  src2 = _mm_sub_pi16(src2, dst2); /* src2 - dst2 -> src2 */
309  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
310  src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
311  dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
312 
313  dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
314  dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
315 
316  *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
317 
318  srcp += 2;
319  dstp += 2;
320  }
321  srcp += srcskip;
322  dstp += dstskip;
323  }
324  _mm_empty();
325  }
326 }
327 
328 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
329 static void
330 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
331 {
332  int width = info->dst_w;
333  int height = info->dst_h;
334  Uint32 *srcp = (Uint32 *) info->src;
335  int srcskip = info->src_skip >> 2;
336  Uint32 *dstp = (Uint32 *) info->dst;
337  int dstskip = info->dst_skip >> 2;
338  SDL_PixelFormat *sf = info->src_fmt;
339  Uint32 amask = sf->Amask;
340  Uint32 ashift = sf->Ashift;
341  Uint64 multmask, multmask2;
342 
343  __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
344 
345  mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
346  multmask = 0x00FF;
347  multmask <<= (ashift * 2);
348  multmask2 = 0x00FF00FF00FF00FFULL;
349 
350  while (height--) {
351  /* *INDENT-OFF* */
352  DUFFS_LOOP4({
353  Uint32 alpha = *srcp & amask;
354  if (alpha == 0) {
355  /* do nothing */
356  } else if (alpha == amask) {
357  *dstp = *srcp;
358  } else {
359  src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
360  src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
361 
362  dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
363  dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
364 
365  mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
366  mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
367  mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
368  mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
369  mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
370  mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */
371 
372  /* blend */
373  src1 = _mm_mullo_pi16(src1, mm_alpha);
374  src1 = _mm_srli_pi16(src1, 8);
375  dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
376  dst1 = _mm_srli_pi16(dst1, 8);
377  dst1 = _mm_add_pi16(src1, dst1);
378  dst1 = _mm_packs_pu16(dst1, mm_zero);
379 
380  *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
381  }
382  ++srcp;
383  ++dstp;
384  }, width);
385  /* *INDENT-ON* */
386  srcp += srcskip;
387  dstp += dstskip;
388  }
389  _mm_empty();
390 }
391 
392 #endif /* __MMX__ */
393 
394 #if SDL_ARM_SIMD_BLITTERS
395 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
396 
397 static void
398 BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
399 {
400  int32_t width = info->dst_w;
401  int32_t height = info->dst_h;
402  uint16_t *dstp = (uint16_t *)info->dst;
403  int32_t dststride = width + (info->dst_skip >> 1);
404  uint32_t *srcp = (uint32_t *)info->src;
405  int32_t srcstride = width + (info->src_skip >> 2);
406 
407  BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
408 }
409 
410 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
411 
412 static void
413 BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
414 {
415  int32_t width = info->dst_w;
416  int32_t height = info->dst_h;
417  uint32_t *dstp = (uint32_t *)info->dst;
418  int32_t dststride = width + (info->dst_skip >> 2);
419  uint32_t *srcp = (uint32_t *)info->src;
420  int32_t srcstride = width + (info->src_skip >> 2);
421 
422  BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
423 }
424 #endif
425 
426 #if SDL_ARM_NEON_BLITTERS
427 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
428 
429 static void
430 BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
431 {
432  int32_t width = info->dst_w;
433  int32_t height = info->dst_h;
434  uint16_t *dstp = (uint16_t *)info->dst;
435  int32_t dststride = width + (info->dst_skip >> 1);
436  uint32_t *srcp = (uint32_t *)info->src;
437  int32_t srcstride = width + (info->src_skip >> 2);
438 
439  BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
440 }
441 
442 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
443 
444 static void
445 BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)
446 {
447  int32_t width = info->dst_w;
448  int32_t height = info->dst_h;
449  uint32_t *dstp = (uint32_t *)info->dst;
450  int32_t dststride = width + (info->dst_skip >> 2);
451  uint32_t *srcp = (uint32_t *)info->src;
452  int32_t srcstride = width + (info->src_skip >> 2);
453 
454  BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
455 }
456 #endif
457 
458 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
459 static void
461 {
462  int width = info->dst_w;
463  int height = info->dst_h;
464  Uint32 *srcp = (Uint32 *) info->src;
465  int srcskip = info->src_skip >> 2;
466  Uint32 *dstp = (Uint32 *) info->dst;
467  int dstskip = info->dst_skip >> 2;
468 
469  while (height--) {
470  /* *INDENT-OFF* */
471  DUFFS_LOOP4({
472  Uint32 s = *srcp++;
473  Uint32 d = *dstp;
474  *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
475  + (s & d & 0x00010101)) | 0xff000000;
476  }, width);
477  /* *INDENT-ON* */
478  srcp += srcskip;
479  dstp += dstskip;
480  }
481 }
482 
483 /* fast RGB888->(A)RGB888 blending with surface alpha */
484 static void
486 {
487  unsigned alpha = info->a;
488  if (alpha == 128) {
490  } else {
491  int width = info->dst_w;
492  int height = info->dst_h;
493  Uint32 *srcp = (Uint32 *) info->src;
494  int srcskip = info->src_skip >> 2;
495  Uint32 *dstp = (Uint32 *) info->dst;
496  int dstskip = info->dst_skip >> 2;
497  Uint32 s;
498  Uint32 d;
499  Uint32 s1;
500  Uint32 d1;
501 
502  while (height--) {
503  /* *INDENT-OFF* */
504  DUFFS_LOOP4({
505  s = *srcp;
506  d = *dstp;
507  s1 = s & 0xff00ff;
508  d1 = d & 0xff00ff;
509  d1 = (d1 + ((s1 - d1) * alpha >> 8))
510  & 0xff00ff;
511  s &= 0xff00;
512  d &= 0xff00;
513  d = (d + ((s - d) * alpha >> 8)) & 0xff00;
514  *dstp = d1 | d | 0xff000000;
515  ++srcp;
516  ++dstp;
517  }, width);
518  /* *INDENT-ON* */
519  srcp += srcskip;
520  dstp += dstskip;
521  }
522  }
523 }
524 
525 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
526 static void
528 {
529  int width = info->dst_w;
530  int height = info->dst_h;
531  Uint32 *srcp = (Uint32 *) info->src;
532  int srcskip = info->src_skip >> 2;
533  Uint32 *dstp = (Uint32 *) info->dst;
534  int dstskip = info->dst_skip >> 2;
535 
536  while (height--) {
537  /* *INDENT-OFF* */
538  DUFFS_LOOP4({
539  Uint32 dalpha;
540  Uint32 d;
541  Uint32 s1;
542  Uint32 d1;
543  Uint32 s = *srcp;
544  Uint32 alpha = s >> 24;
545  /* FIXME: Here we special-case opaque alpha since the
546  compositioning used (>>8 instead of /255) doesn't handle
547  it correctly. Also special-case alpha=0 for speed?
548  Benchmark this! */
549  if (alpha) {
550  if (alpha == SDL_ALPHA_OPAQUE) {
551  *dstp = *srcp;
552  } else {
553  /*
554  * take out the middle component (green), and process
555  * the other two in parallel. One multiply less.
556  */
557  d = *dstp;
558  dalpha = d >> 24;
559  s1 = s & 0xff00ff;
560  d1 = d & 0xff00ff;
561  d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
562  s &= 0xff00;
563  d &= 0xff00;
564  d = (d + ((s - d) * alpha >> 8)) & 0xff00;
565  dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
566  *dstp = d1 | d | (dalpha << 24);
567  }
568  }
569  ++srcp;
570  ++dstp;
571  }, width);
572  /* *INDENT-ON* */
573  srcp += srcskip;
574  dstp += dstskip;
575  }
576 }
577 
578 #ifdef __3dNOW__
579 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
580 static void
581 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
582 {
583  int width = info->dst_w;
584  int height = info->dst_h;
585  Uint32 *srcp = (Uint32 *) info->src;
586  int srcskip = info->src_skip >> 2;
587  Uint32 *dstp = (Uint32 *) info->dst;
588  int dstskip = info->dst_skip >> 2;
589  SDL_PixelFormat *sf = info->src_fmt;
590  Uint32 amask = sf->Amask;
591  Uint32 ashift = sf->Ashift;
592  Uint64 multmask, multmask2;
593 
594  __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
595 
596  mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
597  multmask = 0x00FF;
598  multmask <<= (ashift * 2);
599  multmask2 = 0x00FF00FF00FF00FFULL;
600 
601  while (height--) {
602  /* *INDENT-OFF* */
603  DUFFS_LOOP4({
604  Uint32 alpha;
605 
606  _m_prefetch(srcp + 16);
607  _m_prefetch(dstp + 16);
608 
609  alpha = *srcp & amask;
610  if (alpha == 0) {
611  /* do nothing */
612  } else if (alpha == amask) {
613  *dstp = *srcp;
614  } else {
615  src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
616  src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
617 
618  dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
619  dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
620 
621  mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
622  mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
623  mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
624  mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
625  mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
626  mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */
627 
628 
629  /* blend */
630  src1 = _mm_mullo_pi16(src1, mm_alpha);
631  src1 = _mm_srli_pi16(src1, 8);
632  dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
633  dst1 = _mm_srli_pi16(dst1, 8);
634  dst1 = _mm_add_pi16(src1, dst1);
635  dst1 = _mm_packs_pu16(dst1, mm_zero);
636 
637  *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
638  }
639  ++srcp;
640  ++dstp;
641  }, width);
642  /* *INDENT-ON* */
643  srcp += srcskip;
644  dstp += dstskip;
645  }
646  _mm_empty();
647 }
648 
649 #endif /* __3dNOW__ */
650 
651 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
652 
/*
 * blend a single 16 bit pixel at 50%
 *
 * mask selects every bit except the lowest bit of each colour channel, so
 * the masked sum can be halved with one shift; the (s & d & ~mask) term adds
 * back the bit lost when both low bits were set.  Arguments are fully
 * parenthesised so that expressions may be passed safely; each argument is
 * evaluated more than once, so it must be free of side effects.
 */
#define BLEND16_50(d, s, mask)                                          \
    (((((s) & (mask)) + ((d) & (mask))) >> 1)                           \
     + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * blend two 16 bit pixels at 50%
 *
 * Same scheme applied to two pixels packed in one 32-bit word.  The mask is
 * widened to Uint32 before being replicated into the upper half, so the
 * shift cannot overflow a signed int.
 */
#define BLEND2x16_50(d, s, mask)                                        \
    ((((s) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)             \
     + (((d) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)           \
     + ((s) & (d) & (~((Uint32)(mask) | (Uint32)(mask) << 16))))
661 
662 static void
664 {
665  int width = info->dst_w;
666  int height = info->dst_h;
667  Uint16 *srcp = (Uint16 *) info->src;
668  int srcskip = info->src_skip >> 1;
669  Uint16 *dstp = (Uint16 *) info->dst;
670  int dstskip = info->dst_skip >> 1;
671 
672  while (height--) {
673  if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
674  /*
675  * Source and destination not aligned, pipeline it.
676  * This is mostly a win for big blits but no loss for
677  * small ones
678  */
679  Uint32 prev_sw;
680  int w = width;
681 
682  /* handle odd destination */
683  if ((uintptr_t) dstp & 2) {
684  Uint16 d = *dstp, s = *srcp;
685  *dstp = BLEND16_50(d, s, mask);
686  dstp++;
687  srcp++;
688  w--;
689  }
690  srcp++; /* srcp is now 32-bit aligned */
691 
692  /* bootstrap pipeline with first halfword */
693  prev_sw = ((Uint32 *) srcp)[-1];
694 
695  while (w > 1) {
696  Uint32 sw, dw, s;
697  sw = *(Uint32 *) srcp;
698  dw = *(Uint32 *) dstp;
699 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
700  s = (prev_sw << 16) + (sw >> 16);
701 #else
702  s = (prev_sw >> 16) + (sw << 16);
703 #endif
704  prev_sw = sw;
705  *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
706  dstp += 2;
707  srcp += 2;
708  w -= 2;
709  }
710 
711  /* final pixel if any */
712  if (w) {
713  Uint16 d = *dstp, s;
714 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
715  s = (Uint16) prev_sw;
716 #else
717  s = (Uint16) (prev_sw >> 16);
718 #endif
719  *dstp = BLEND16_50(d, s, mask);
720  srcp++;
721  dstp++;
722  }
723  srcp += srcskip - 1;
724  dstp += dstskip;
725  } else {
726  /* source and destination are aligned */
727  int w = width;
728 
729  /* first odd pixel? */
730  if ((uintptr_t) srcp & 2) {
731  Uint16 d = *dstp, s = *srcp;
732  *dstp = BLEND16_50(d, s, mask);
733  srcp++;
734  dstp++;
735  w--;
736  }
737  /* srcp and dstp are now 32-bit aligned */
738 
739  while (w > 1) {
740  Uint32 sw = *(Uint32 *) srcp;
741  Uint32 dw = *(Uint32 *) dstp;
742  *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
743  srcp += 2;
744  dstp += 2;
745  w -= 2;
746  }
747 
748  /* last odd pixel? */
749  if (w) {
750  Uint16 d = *dstp, s = *srcp;
751  *dstp = BLEND16_50(d, s, mask);
752  srcp++;
753  dstp++;
754  }
755  srcp += srcskip;
756  dstp += dstskip;
757  }
758  }
759 }
760 
761 #ifdef __MMX__
762 
763 /* fast RGB565->RGB565 blending with surface alpha */
764 static void
765 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
766 {
767  unsigned alpha = info->a;
768  if (alpha == 128) {
769  Blit16to16SurfaceAlpha128(info, 0xf7de);
770  } else {
771  int width = info->dst_w;
772  int height = info->dst_h;
773  Uint16 *srcp = (Uint16 *) info->src;
774  int srcskip = info->src_skip >> 1;
775  Uint16 *dstp = (Uint16 *) info->dst;
776  int dstskip = info->dst_skip >> 1;
777  Uint32 s, d;
778 
779  __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
780 
781  alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */
782  mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
783  alpha >>= 3; /* downscale alpha to 5 bits */
784 
785  mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
786  mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
787  /* position alpha to allow for mullo and mulhi on diff channels
788  to reduce the number of operations */
789  mm_alpha = _mm_slli_si64(mm_alpha, 3);
790 
791  /* Setup the 565 color channel masks */
792  gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
793  bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
794 
795  while (height--) {
796  /* *INDENT-OFF* */
798  {
799  s = *srcp++;
800  d = *dstp;
801  /*
802  * shift out the middle component (green) to
803  * the high 16 bits, and process all three RGB
804  * components at the same time.
805  */
806  s = (s | s << 16) & 0x07e0f81f;
807  d = (d | d << 16) & 0x07e0f81f;
808  d += (s - d) * alpha >> 5;
809  d &= 0x07e0f81f;
810  *dstp++ = (Uint16)(d | d >> 16);
811  },{
812  s = *srcp++;
813  d = *dstp;
814  /*
815  * shift out the middle component (green) to
816  * the high 16 bits, and process all three RGB
817  * components at the same time.
818  */
819  s = (s | s << 16) & 0x07e0f81f;
820  d = (d | d << 16) & 0x07e0f81f;
821  d += (s - d) * alpha >> 5;
822  d &= 0x07e0f81f;
823  *dstp++ = (Uint16)(d | d >> 16);
824  s = *srcp++;
825  d = *dstp;
826  /*
827  * shift out the middle component (green) to
828  * the high 16 bits, and process all three RGB
829  * components at the same time.
830  */
831  s = (s | s << 16) & 0x07e0f81f;
832  d = (d | d << 16) & 0x07e0f81f;
833  d += (s - d) * alpha >> 5;
834  d &= 0x07e0f81f;
835  *dstp++ = (Uint16)(d | d >> 16);
836  },{
837  src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
838  dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
839 
840  /* red */
841  src2 = src1;
842  src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
843 
844  dst2 = dst1;
845  dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
846 
847  /* blend */
848  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
849  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
850  src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
851  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
852  dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
853 
854  mm_res = dst2; /* RED -> mm_res */
855 
856  /* green -- process the bits in place */
857  src2 = src1;
858  src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
859 
860  dst2 = dst1;
861  dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
862 
863  /* blend */
864  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
865  src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
866  src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
867  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
868 
869  mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
870 
871  /* blue */
872  src2 = src1;
873  src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
874 
875  dst2 = dst1;
876  dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
877 
878  /* blend */
879  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
880  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
881  src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
882  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
883  dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
884 
885  mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
886 
887  *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
888 
889  srcp += 4;
890  dstp += 4;
891  }, width);
892  /* *INDENT-ON* */
893  srcp += srcskip;
894  dstp += dstskip;
895  }
896  _mm_empty();
897  }
898 }
899 
900 /* fast RGB555->RGB555 blending with surface alpha */
901 static void
902 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
903 {
904  unsigned alpha = info->a;
905  if (alpha == 128) {
906  Blit16to16SurfaceAlpha128(info, 0xfbde);
907  } else {
908  int width = info->dst_w;
909  int height = info->dst_h;
910  Uint16 *srcp = (Uint16 *) info->src;
911  int srcskip = info->src_skip >> 1;
912  Uint16 *dstp = (Uint16 *) info->dst;
913  int dstskip = info->dst_skip >> 1;
914  Uint32 s, d;
915 
916  __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
917 
918  alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */
919  mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
920  alpha >>= 3; /* downscale alpha to 5 bits */
921 
922  mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
923  mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
924  /* position alpha to allow for mullo and mulhi on diff channels
925  to reduce the number of operations */
926  mm_alpha = _mm_slli_si64(mm_alpha, 3);
927 
928  /* Setup the 555 color channel masks */
929  rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
930  gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
931  bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
932 
933  while (height--) {
934  /* *INDENT-OFF* */
936  {
937  s = *srcp++;
938  d = *dstp;
939  /*
940  * shift out the middle component (green) to
941  * the high 16 bits, and process all three RGB
942  * components at the same time.
943  */
944  s = (s | s << 16) & 0x03e07c1f;
945  d = (d | d << 16) & 0x03e07c1f;
946  d += (s - d) * alpha >> 5;
947  d &= 0x03e07c1f;
948  *dstp++ = (Uint16)(d | d >> 16);
949  },{
950  s = *srcp++;
951  d = *dstp;
952  /*
953  * shift out the middle component (green) to
954  * the high 16 bits, and process all three RGB
955  * components at the same time.
956  */
957  s = (s | s << 16) & 0x03e07c1f;
958  d = (d | d << 16) & 0x03e07c1f;
959  d += (s - d) * alpha >> 5;
960  d &= 0x03e07c1f;
961  *dstp++ = (Uint16)(d | d >> 16);
962  s = *srcp++;
963  d = *dstp;
964  /*
965  * shift out the middle component (green) to
966  * the high 16 bits, and process all three RGB
967  * components at the same time.
968  */
969  s = (s | s << 16) & 0x03e07c1f;
970  d = (d | d << 16) & 0x03e07c1f;
971  d += (s - d) * alpha >> 5;
972  d &= 0x03e07c1f;
973  *dstp++ = (Uint16)(d | d >> 16);
974  },{
975  src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
976  dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
977 
978  /* red -- process the bits in place */
979  src2 = src1;
980  src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
981 
982  dst2 = dst1;
983  dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
984 
985  /* blend */
986  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
987  src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
988  src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
989  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
990  dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
991 
992  mm_res = dst2; /* RED -> mm_res */
993 
994  /* green -- process the bits in place */
995  src2 = src1;
996  src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
997 
998  dst2 = dst1;
999  dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
1000 
1001  /* blend */
1002  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
1003  src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
1004  src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
1005  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
1006 
1007  mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
1008 
1009  /* blue */
1010  src2 = src1; /* src -> src2 */
1011  src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
1012 
1013  dst2 = dst1; /* dst -> dst2 */
1014  dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
1015 
1016  /* blend */
1017  src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
1018  src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
1019  src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
1020  dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
1021  dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
1022 
1023  mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
1024 
1025  *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
1026 
1027  srcp += 4;
1028  dstp += 4;
1029  }, width);
1030  /* *INDENT-ON* */
1031  srcp += srcskip;
1032  dstp += dstskip;
1033  }
1034  _mm_empty();
1035  }
1036 }
1037 
1038 #endif /* __MMX__ */
1039 
1040 /* fast RGB565->RGB565 blending with surface alpha */
1041 static void
1043 {
1044  unsigned alpha = info->a;
1045  if (alpha == 128) {
1046  Blit16to16SurfaceAlpha128(info, 0xf7de);
1047  } else {
1048  int width = info->dst_w;
1049  int height = info->dst_h;
1050  Uint16 *srcp = (Uint16 *) info->src;
1051  int srcskip = info->src_skip >> 1;
1052  Uint16 *dstp = (Uint16 *) info->dst;
1053  int dstskip = info->dst_skip >> 1;
1054  alpha >>= 3; /* downscale alpha to 5 bits */
1055 
1056  while (height--) {
1057  /* *INDENT-OFF* */
1058  DUFFS_LOOP4({
1059  Uint32 s = *srcp++;
1060  Uint32 d = *dstp;
1061  /*
1062  * shift out the middle component (green) to
1063  * the high 16 bits, and process all three RGB
1064  * components at the same time.
1065  */
1066  s = (s | s << 16) & 0x07e0f81f;
1067  d = (d | d << 16) & 0x07e0f81f;
1068  d += (s - d) * alpha >> 5;
1069  d &= 0x07e0f81f;
1070  *dstp++ = (Uint16)(d | d >> 16);
1071  }, width);
1072  /* *INDENT-ON* */
1073  srcp += srcskip;
1074  dstp += dstskip;
1075  }
1076  }
1077 }
1078 
1079 /* fast RGB555->RGB555 blending with surface alpha */
1080 static void
1082 {
1083  unsigned alpha = info->a; /* downscale alpha to 5 bits */
1084  if (alpha == 128) {
1085  Blit16to16SurfaceAlpha128(info, 0xfbde);
1086  } else {
1087  int width = info->dst_w;
1088  int height = info->dst_h;
1089  Uint16 *srcp = (Uint16 *) info->src;
1090  int srcskip = info->src_skip >> 1;
1091  Uint16 *dstp = (Uint16 *) info->dst;
1092  int dstskip = info->dst_skip >> 1;
1093  alpha >>= 3; /* downscale alpha to 5 bits */
1094 
1095  while (height--) {
1096  /* *INDENT-OFF* */
1097  DUFFS_LOOP4({
1098  Uint32 s = *srcp++;
1099  Uint32 d = *dstp;
1100  /*
1101  * shift out the middle component (green) to
1102  * the high 16 bits, and process all three RGB
1103  * components at the same time.
1104  */
1105  s = (s | s << 16) & 0x03e07c1f;
1106  d = (d | d << 16) & 0x03e07c1f;
1107  d += (s - d) * alpha >> 5;
1108  d &= 0x03e07c1f;
1109  *dstp++ = (Uint16)(d | d >> 16);
1110  }, width);
1111  /* *INDENT-ON* */
1112  srcp += srcskip;
1113  dstp += dstskip;
1114  }
1115  }
1116 }
1117 
1118 /* fast ARGB8888->RGB565 blending with pixel alpha */
1119 static void
1121 {
1122  int width = info->dst_w;
1123  int height = info->dst_h;
1124  Uint32 *srcp = (Uint32 *) info->src;
1125  int srcskip = info->src_skip >> 2;
1126  Uint16 *dstp = (Uint16 *) info->dst;
1127  int dstskip = info->dst_skip >> 1;
1128 
1129  while (height--) {
1130  /* *INDENT-OFF* */
1131  DUFFS_LOOP4({
1132  Uint32 s = *srcp;
1133  unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
1134  /* FIXME: Here we special-case opaque alpha since the
1135  compositioning used (>>8 instead of /255) doesn't handle
1136  it correctly. Also special-case alpha=0 for speed?
1137  Benchmark this! */
1138  if(alpha) {
1139  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1140  *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
1141  } else {
1142  Uint32 d = *dstp;
1143  /*
1144  * convert source and destination to G0RAB65565
1145  * and blend all components at the same time
1146  */
1147  s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
1148  + (s >> 3 & 0x1f);
1149  d = (d | d << 16) & 0x07e0f81f;
1150  d += (s - d) * alpha >> 5;
1151  d &= 0x07e0f81f;
1152  *dstp = (Uint16)(d | d >> 16);
1153  }
1154  }
1155  srcp++;
1156  dstp++;
1157  }, width);
1158  /* *INDENT-ON* */
1159  srcp += srcskip;
1160  dstp += dstskip;
1161  }
1162 }
1163 
1164 /* fast ARGB8888->RGB555 blending with pixel alpha */
1165 static void
1167 {
1168  int width = info->dst_w;
1169  int height = info->dst_h;
1170  Uint32 *srcp = (Uint32 *) info->src;
1171  int srcskip = info->src_skip >> 2;
1172  Uint16 *dstp = (Uint16 *) info->dst;
1173  int dstskip = info->dst_skip >> 1;
1174 
1175  while (height--) {
1176  /* *INDENT-OFF* */
1177  DUFFS_LOOP4({
1178  unsigned alpha;
1179  Uint32 s = *srcp;
1180  alpha = s >> 27; /* downscale alpha to 5 bits */
1181  /* FIXME: Here we special-case opaque alpha since the
1182  compositioning used (>>8 instead of /255) doesn't handle
1183  it correctly. Also special-case alpha=0 for speed?
1184  Benchmark this! */
1185  if(alpha) {
1186  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1187  *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
1188  } else {
1189  Uint32 d = *dstp;
1190  /*
1191  * convert source and destination to G0RAB65565
1192  * and blend all components at the same time
1193  */
1194  s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
1195  + (s >> 3 & 0x1f);
1196  d = (d | d << 16) & 0x03e07c1f;
1197  d += (s - d) * alpha >> 5;
1198  d &= 0x03e07c1f;
1199  *dstp = (Uint16)(d | d >> 16);
1200  }
1201  }
1202  srcp++;
1203  dstp++;
1204  }, width);
1205  /* *INDENT-ON* */
1206  srcp += srcskip;
1207  dstp += dstskip;
1208  }
1209 }
1210 
1211 /* General (slow) N->N blending with per-surface alpha */
1212 static void
1214 {
1215  int width = info->dst_w;
1216  int height = info->dst_h;
1217  Uint8 *src = info->src;
1218  int srcskip = info->src_skip;
1219  Uint8 *dst = info->dst;
1220  int dstskip = info->dst_skip;
1221  SDL_PixelFormat *srcfmt = info->src_fmt;
1222  SDL_PixelFormat *dstfmt = info->dst_fmt;
1223  int srcbpp = srcfmt->BytesPerPixel;
1224  int dstbpp = dstfmt->BytesPerPixel;
1225  Uint32 Pixel;
1226  unsigned sR, sG, sB;
1227  unsigned dR, dG, dB, dA;
1228  const unsigned sA = info->a;
1229 
1230  if (sA) {
1231  while (height--) {
1232  /* *INDENT-OFF* */
1233  DUFFS_LOOP4(
1234  {
1235  DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
1236  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1237  ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1238  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1239  src += srcbpp;
1240  dst += dstbpp;
1241  },
1242  width);
1243  /* *INDENT-ON* */
1244  src += srcskip;
1245  dst += dstskip;
1246  }
1247  }
1248 }
1249 
1250 /* General (slow) colorkeyed N->N blending with per-surface alpha */
1251 static void
1253 {
1254  int width = info->dst_w;
1255  int height = info->dst_h;
1256  Uint8 *src = info->src;
1257  int srcskip = info->src_skip;
1258  Uint8 *dst = info->dst;
1259  int dstskip = info->dst_skip;
1260  SDL_PixelFormat *srcfmt = info->src_fmt;
1261  SDL_PixelFormat *dstfmt = info->dst_fmt;
1262  Uint32 ckey = info->colorkey;
1263  int srcbpp = srcfmt->BytesPerPixel;
1264  int dstbpp = dstfmt->BytesPerPixel;
1265  Uint32 Pixel;
1266  unsigned sR, sG, sB;
1267  unsigned dR, dG, dB, dA;
1268  const unsigned sA = info->a;
1269 
1270  while (height--) {
1271  /* *INDENT-OFF* */
1272  DUFFS_LOOP4(
1273  {
1274  RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
1275  if(sA && Pixel != ckey) {
1276  RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
1277  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1278  ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1279  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1280  }
1281  src += srcbpp;
1282  dst += dstbpp;
1283  },
1284  width);
1285  /* *INDENT-ON* */
1286  src += srcskip;
1287  dst += dstskip;
1288  }
1289 }
1290 
1291 /* General (slow) N->N blending with pixel alpha */
1292 static void
1294 {
1295  int width = info->dst_w;
1296  int height = info->dst_h;
1297  Uint8 *src = info->src;
1298  int srcskip = info->src_skip;
1299  Uint8 *dst = info->dst;
1300  int dstskip = info->dst_skip;
1301  SDL_PixelFormat *srcfmt = info->src_fmt;
1302  SDL_PixelFormat *dstfmt = info->dst_fmt;
1303  int srcbpp;
1304  int dstbpp;
1305  Uint32 Pixel;
1306  unsigned sR, sG, sB, sA;
1307  unsigned dR, dG, dB, dA;
1308 
1309  /* Set up some basic variables */
1310  srcbpp = srcfmt->BytesPerPixel;
1311  dstbpp = dstfmt->BytesPerPixel;
1312 
1313  while (height--) {
1314  /* *INDENT-OFF* */
1315  DUFFS_LOOP4(
1316  {
1317  DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1318  if(sA) {
1319  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1320  ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1321  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1322  }
1323  src += srcbpp;
1324  dst += dstbpp;
1325  },
1326  width);
1327  /* *INDENT-ON* */
1328  src += srcskip;
1329  dst += dstskip;
1330  }
1331 }
1332 
1333 
1336 {
1338  SDL_PixelFormat *df = surface->map->dst->format;
1339 
1340  switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
1341  case SDL_COPY_BLEND:
1342  /* Per-pixel alpha blits */
1343  switch (df->BytesPerPixel) {
1344  case 1:
1345  if (df->palette != NULL) {
1346  return BlitNto1PixelAlpha;
1347  } else {
1348  /* RGB332 has no palette ! */
1349  return BlitNtoNPixelAlpha;
1350  }
1351 
1352  case 2:
1353 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
1354  if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1355  && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
1356  && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1357  || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
1358  {
1359 #if SDL_ARM_NEON_BLITTERS
1360  if (SDL_HasNEON())
1361  return BlitARGBto565PixelAlphaARMNEON;
1362 #endif
1363 #if SDL_ARM_SIMD_BLITTERS
1364  if (SDL_HasARMSIMD())
1365  return BlitARGBto565PixelAlphaARMSIMD;
1366 #endif
1367  }
1368 #endif
1369  if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1370  && sf->Gmask == 0xff00
1371  && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1372  || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
1373  if (df->Gmask == 0x7e0)
1374  return BlitARGBto565PixelAlpha;
1375  else if (df->Gmask == 0x3e0)
1376  return BlitARGBto555PixelAlpha;
1377  }
1378  return BlitNtoNPixelAlpha;
1379 
1380  case 4:
1381  if (sf->Rmask == df->Rmask
1382  && sf->Gmask == df->Gmask
1383  && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1384 #if defined(__MMX__) || defined(__3dNOW__)
1385  if (sf->Rshift % 8 == 0
1386  && sf->Gshift % 8 == 0
1387  && sf->Bshift % 8 == 0
1388  && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
1389 #ifdef __3dNOW__
1390  if (SDL_Has3DNow())
1391  return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1392 #endif
1393 #ifdef __MMX__
1394  if (SDL_HasMMX())
1395  return BlitRGBtoRGBPixelAlphaMMX;
1396 #endif
1397  }
1398 #endif /* __MMX__ || __3dNOW__ */
1399  if (sf->Amask == 0xff000000) {
1400 #if SDL_ARM_NEON_BLITTERS
1401  if (SDL_HasNEON())
1402  return BlitRGBtoRGBPixelAlphaARMNEON;
1403 #endif
1404 #if SDL_ARM_SIMD_BLITTERS
1405  if (SDL_HasARMSIMD())
1406  return BlitRGBtoRGBPixelAlphaARMSIMD;
1407 #endif
1408  return BlitRGBtoRGBPixelAlpha;
1409  }
1410  }
1411  return BlitNtoNPixelAlpha;
1412 
1413  case 3:
1414  default:
1415  break;
1416  }
1417  return BlitNtoNPixelAlpha;
1418 
1420  if (sf->Amask == 0) {
1421  /* Per-surface alpha blits */
1422  switch (df->BytesPerPixel) {
1423  case 1:
1424  if (df->palette != NULL) {
1425  return BlitNto1SurfaceAlpha;
1426  } else {
1427  /* RGB332 has no palette ! */
1428  return BlitNtoNSurfaceAlpha;
1429  }
1430 
1431  case 2:
1432  if (surface->map->identity) {
1433  if (df->Gmask == 0x7e0) {
1434 #ifdef __MMX__
1435  if (SDL_HasMMX())
1436  return Blit565to565SurfaceAlphaMMX;
1437  else
1438 #endif
1439  return Blit565to565SurfaceAlpha;
1440  } else if (df->Gmask == 0x3e0) {
1441 #ifdef __MMX__
1442  if (SDL_HasMMX())
1443  return Blit555to555SurfaceAlphaMMX;
1444  else
1445 #endif
1446  return Blit555to555SurfaceAlpha;
1447  }
1448  }
1449  return BlitNtoNSurfaceAlpha;
1450 
1451  case 4:
1452  if (sf->Rmask == df->Rmask
1453  && sf->Gmask == df->Gmask
1454  && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1455 #ifdef __MMX__
1456  if (sf->Rshift % 8 == 0
1457  && sf->Gshift % 8 == 0
1458  && sf->Bshift % 8 == 0 && SDL_HasMMX())
1459  return BlitRGBtoRGBSurfaceAlphaMMX;
1460 #endif
1461  if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
1462  return BlitRGBtoRGBSurfaceAlpha;
1463  }
1464  }
1465  return BlitNtoNSurfaceAlpha;
1466 
1467  case 3:
1468  default:
1469  return BlitNtoNSurfaceAlpha;
1470  }
1471  }
1472  break;
1473 
1475  if (sf->Amask == 0) {
1476  if (df->BytesPerPixel == 1) {
1477 
1478  if (df->palette != NULL) {
1479  return BlitNto1SurfaceAlphaKey;
1480  } else {
1481  /* RGB332 has no palette ! */
1482  return BlitNtoNSurfaceAlphaKey;
1483  }
1484  } else {
1485  return BlitNtoNSurfaceAlphaKey;
1486  }
1487  }
1488  break;
1489  }
1490 
1491  return NULL;
1492 }
1493 
1494 #endif /* SDL_HAVE_BLIT_A */
1495 
1496 /* vi: set ts=4 sw=4 expandtab: */
SDL_BlitInfo::src
Uint8 * src
Definition: SDL_blit.h:58
SDL_PixelFormat::Rshift
Uint8 Rshift
Definition: SDL_pixels.h:333
SDL_BlitInfo::src_skip
int src_skip
Definition: SDL_blit.h:61
SDL_PixelFormat::Ashift
Uint8 Ashift
Definition: SDL_pixels.h:336
BlitNto1SurfaceAlphaKey
static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:125
Uint8
uint8_t Uint8
Definition: SDL_stdinc.h:179
ASSEMBLE_RGBA
#define ASSEMBLE_RGBA(buf, bpp, fmt, r, g, b, a)
Definition: SDL_blit.h:403
SDL_PixelFormat::BytesPerPixel
Uint8 BytesPerPixel
Definition: SDL_pixels.h:323
BLEND2x16_50
#define BLEND2x16_50(d, s, mask)
Definition: SDL_blit_A.c:658
Blit16to16SurfaceAlpha128
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
Definition: SDL_blit_A.c:663
mask
GLenum GLint GLuint mask
Definition: SDL_opengl_glext.h:660
Uint16
uint16_t Uint16
Definition: SDL_stdinc.h:191
SDL_Color::b
Uint8 b
Definition: SDL_pixels.h:302
DUFFS_LOOP4
#define DUFFS_LOOP4(pixel_copy_increment, width)
Definition: SDL_blit.h:489
SDL_Surface
A collection of pixels used in software blitting.
Definition: SDL_surface.h:71
SDL_Has3DNow
#define SDL_Has3DNow
Definition: SDL_dynapi_overrides.h:106
BlitARGBto555PixelAlpha
static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1166
NULL
#define NULL
Definition: begin_code.h:167
surface
EGLSurface surface
Definition: eglext.h:248
SDL_PixelFormat::format
Uint32 format
Definition: SDL_pixels.h:320
SDL_ALPHA_OPAQUE
#define SDL_ALPHA_OPAQUE
Definition: SDL_pixels.h:46
width
GLint GLint GLsizei width
Definition: SDL_opengl.h:1572
BlitNto1SurfaceAlpha
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:32
RETRIEVE_RGB_PIXEL
#define RETRIEVE_RGB_PIXEL(buf, bpp, Pixel)
Definition: SDL_blit.h:147
SDL_BlitInfo::dst_w
int dst_w
Definition: SDL_blit.h:63
ALPHA_BLEND_RGB
#define ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB)
Definition: SDL_blit.h:446
SDL_BlitInfo
Definition: SDL_blit.h:57
SDL_Color::r
Uint8 r
Definition: SDL_pixels.h:300
SDL_BlitInfo::dst_h
int dst_h
Definition: SDL_blit.h:63
uint16_t
unsigned short uint16_t
Definition: SDL_config_windows.h:61
SDL_COPY_COLORKEY
#define SDL_COPY_COLORKEY
Definition: SDL_blit.h:40
Uint32
uint32_t Uint32
Definition: SDL_stdinc.h:203
SDL_PixelFormat::Rmask
Uint32 Rmask
Definition: SDL_pixels.h:325
h
GLfloat GLfloat GLfloat GLfloat h
Definition: SDL_opengl_glext.h:1949
SDL_COPY_RLE_MASK
#define SDL_COPY_RLE_MASK
Definition: SDL_blit.h:45
DISEMBLE_RGBA
#define DISEMBLE_RGBA(buf, bpp, fmt, Pixel, r, g, b, a)
Definition: SDL_blit.h:354
n
GLdouble n
Definition: SDL_opengl_glext.h:1955
s1
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat s1
Definition: SDL_opengl_glext.h:8586
alpha
GLfloat GLfloat GLfloat alpha
Definition: SDL_opengl_glext.h:415
dst
GLenum GLenum dst
Definition: SDL_opengl_glext.h:1740
SDL_BlitInfo::dst
Uint8 * dst
Definition: SDL_blit.h:62
DUFFS_LOOP
#define DUFFS_LOOP(pixel_copy_increment, width)
Definition: SDL_blit.h:501
SDL_Palette::colors
SDL_Color * colors
Definition: SDL_pixels.h:310
BlitARGBto565PixelAlpha
static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1120
SDL_BlitInfo::src_fmt
SDL_PixelFormat * src_fmt
Definition: SDL_blit.h:66
Blit555to555SurfaceAlpha
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1081
SDL_Color::g
Uint8 g
Definition: SDL_pixels.h:301
int32_t
signed int int32_t
Definition: SDL_config_windows.h:62
BlitNtoNPixelAlpha
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1293
BlitNtoNSurfaceAlphaKey
static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1252
SDL_blit.h
height
GLint GLint GLsizei GLsizei height
Definition: SDL_opengl.h:1572
SDL_COPY_BLEND
#define SDL_COPY_BLEND
Definition: SDL_blit.h:36
SDL_BlitInfo::dst_skip
int dst_skip
Definition: SDL_blit.h:65
SDL_BlitInfo::table
Uint8 * table
Definition: SDL_blit.h:68
SDL_PixelFormat::palette
SDL_Palette * palette
Definition: SDL_pixels.h:321
BlitNto1PixelAlpha
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:79
SDL_PixelFormat::Amask
Uint32 Amask
Definition: SDL_pixels.h:328
SDL_HasNEON
#define SDL_HasNEON
Definition: SDL_dynapi_overrides.h:618
ALPHA_BLEND_RGBA
#define ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA)
Definition: SDL_blit.h:455
SDL_PixelFormat
Definition: SDL_pixels.h:319
SDL_BlitInfo::a
Uint8 a
Definition: SDL_blit.h:71
SDL_PixelFormat::Gmask
Uint32 Gmask
Definition: SDL_pixels.h:326
Blit565to565SurfaceAlpha
static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1042
BlitNtoNSurfaceAlpha
static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:1213
SDL_COPY_MODULATE_ALPHA
#define SDL_COPY_MODULATE_ALPHA
Definition: SDL_blit.h:35
BlitRGBtoRGBSurfaceAlpha128
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:460
SDL_BlitInfo::dst_fmt
SDL_PixelFormat * dst_fmt
Definition: SDL_blit.h:67
BLEND16_50
#define BLEND16_50(d, s, mask)
Definition: SDL_blit_A.c:654
DISEMBLE_RGB
#define DISEMBLE_RGB(buf, bpp, fmt, Pixel, r, g, b)
Definition: SDL_blit.h:178
SDL_HasMMX
#define SDL_HasMMX
Definition: SDL_dynapi_overrides.h:105
SDL_PixelFormat::Bmask
Uint32 Bmask
Definition: SDL_pixels.h:327
src
GLenum src
Definition: SDL_opengl_glext.h:1740
uint32_t
unsigned int uint32_t
Definition: SDL_config_windows.h:63
SDL_PixelFormat::Bshift
Uint8 Bshift
Definition: SDL_pixels.h:335
s
GLdouble s
Definition: SDL_opengl.h:2063
SDL_PixelFormat::Aloss
Uint8 Aloss
Definition: SDL_pixels.h:332
Uint64
uint64_t Uint64
Definition: SDL_stdinc.h:216
SDL_HasARMSIMD
#define SDL_HasARMSIMD
Definition: SDL_dynapi_overrides.h:730
SDL_BlitInfo::colorkey
Uint32 colorkey
Definition: SDL_blit.h:70
DUFFS_LOOP_124
#define DUFFS_LOOP_124(pixel_copy_increment1, pixel_copy_increment2, pixel_copy_increment4, width)
Definition: SDL_blit.h:505
SDL_CalculateBlitA
SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
Definition: SDL_blit_A.c:1335
SDL_video.h
RGB_FROM_PIXEL
#define RGB_FROM_PIXEL(Pixel, fmt, r, g, b)
Definition: SDL_blit.h:123
SDL_PixelFormat::Gshift
Uint8 Gshift
Definition: SDL_pixels.h:334
BlitRGBtoRGBPixelAlpha
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:527
BlitRGBtoRGBSurfaceAlpha
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
Definition: SDL_blit_A.c:485
SDL_BlitFunc
void(* SDL_BlitFunc)(SDL_BlitInfo *info)
Definition: SDL_blit.h:74
d
SDL_PRINTF_FORMAT_STRING const char int SDL_PRINTF_FORMAT_STRING const char int SDL_PRINTF_FORMAT_STRING const char int SDL_PRINTF_FORMAT_STRING const char const char SDL_SCANF_FORMAT_STRING const char return SDL_ThreadFunction const char void return Uint32 return Uint32 SDL_AssertionHandler void SDL_SpinLock SDL_atomic_t int int return SDL_atomic_t return void void void return void return int return SDL_AudioSpec SDL_AudioSpec return int int return return int SDL_RWops int SDL_AudioSpec Uint8 ** d
Definition: SDL_dynapi_procs.h:117
w
GLubyte GLubyte GLubyte GLubyte w
Definition: SDL_opengl_glext.h:734
uintptr_t
unsigned int uintptr_t
Definition: SDL_config_windows.h:70