Subversion Repositories pentevo

Rev

Rev 796 | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed | ?url?

  1. // http://scale2x.sourceforge.net/algorithm.html
  2.  
  3. #include "std.h"
  4.  
  5. #include "emul.h"
  6. #include "vars.h"
  7. #include "draw.h"
  8. #include "dxrend.h"
  9. #include "dxrcopy.h"
  10. #include "dxrframe.h"
  11. #include "dxr_advm.h"
  12.  
  13. inline void line_8_any(unsigned char *dst, unsigned char *src)
  14. {
  15.    if (conf.noflic) line8_nf(dst, src, t.sctab8[0]);
  16.    else line8(dst, src, t.sctab8[0]);
  17. }
  18.  
  19. inline void line_32_any(unsigned char *dst, unsigned char *src)
  20. {
  21.    if (conf.noflic) line32_nf(dst, src, t.sctab32[0]);
  22.    else line32(dst, src, t.sctab32[0]);
  23. }
  24.  
  25. #if 1   // switch between vectorized and branched code
  26.  
  27. #ifdef MOD_SSE2
  28.  
  29. static void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
  30. {
  31.    const unsigned char
  32.       *u = src + ((y-1) & 7)*sc2lines_width,
  33.       *m = src + ((y+0) & 7)*sc2lines_width,
  34.       *l = src + ((y+1) & 7)*sc2lines_width;
  35.  
  36.    for (unsigned i = 0; i < nPix; i += 8) {
  37.  
  38.       __m128i uu = _mm_loadl_epi64((const __m128i*)(u+i));
  39.       __m128i ll = _mm_loadl_epi64((const __m128i*)(l+i));
  40.       __m128i cmp = _mm_cmpeq_epi8(uu, ll);
  41.  
  42.       if (_mm_movemask_epi8(cmp) != 0xFFFF) {
  43.  
  44.          __m128i mm = _mm_loadu_si128((const __m128i*)(m+i-4));
  45.          __m128i uu = _mm_loadu_si128((const __m128i*)(u+i-4));
  46.          __m128i ll = _mm_loadu_si128((const __m128i*)(l+i-4));
  47.  
  48.          __m128i md = _mm_slli_si128(mm,1);
  49.          __m128i mf = _mm_srli_si128(mm,1);
  50.          __m128i maskall = _mm_or_si128(_mm_cmpeq_epi8(md,mf), _mm_cmpeq_epi8(uu,ll));
  51.  
  52.          __m128i e0, e1, v1, v2, v3;
  53.  
  54.          e0 = _mm_cmpeq_epi8(md,uu);
  55.          e0 = _mm_andnot_si128(maskall, e0);
  56.          e0 = _mm_srli_si128(e0,4);
  57.          e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128());
  58.  
  59.          e1 = _mm_cmpeq_epi8(mf,uu);
  60.          e1 = _mm_andnot_si128(maskall, e1);
  61.          e1 = _mm_srli_si128(e1,4);
  62.          e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1);
  63.  
  64.          e0 = _mm_or_si128(e0, e1);
  65.  
  66.          v1 = _mm_srli_si128(mm,4);
  67.          v1 = _mm_unpacklo_epi8(v1,v1);
  68.          v2 = _mm_srli_si128(uu,4);
  69.          v2 = _mm_unpacklo_epi8(v2,v2);
  70.  
  71.          _mm_store_si128((__m128i*)(dst1 + 2*i), _mm_or_si128( _mm_and_si128(e0,v2), _mm_andnot_si128(e0,v1) ) );
  72.  
  73.          e0 = _mm_cmpeq_epi8(md,ll);
  74.          e0 = _mm_andnot_si128(maskall, e0);
  75.          e0 = _mm_srli_si128(e0,4);
  76.          e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128());
  77.  
  78.          e1 = _mm_cmpeq_epi8(mf,ll);
  79.          e1 = _mm_andnot_si128(maskall, e1);
  80.          e1 = _mm_srli_si128(e1,4);
  81.          e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1);
  82.  
  83.          e0 = _mm_or_si128(e0, e1);
  84.  
  85.          v3 = _mm_srli_si128(ll,4);
  86.          v3 = _mm_unpacklo_epi8(v3,v3);
  87.  
  88.          _mm_store_si128((__m128i*)(dst2 + 2*i), _mm_or_si128( _mm_and_si128(e0,v3), _mm_andnot_si128(e0,v1) ) );
  89.  
  90.       } else {
  91.          __m128i v1 = _mm_loadl_epi64((const __m128i*)(m + i));
  92.          v1 = _mm_unpacklo_epi8(v1,v1);
  93.          _mm_store_si128((__m128i*)(dst1 + 2*i), v1);
  94.          _mm_store_si128((__m128i*)(dst2 + 2*i), v1);
  95.       }
  96.    }
  97. }
  98.  
  99. #else // MMX vectorized
  100.  
  101. void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
  102. {
  103.    const unsigned char
  104.       *u = src + ((y-1) & 7)*sc2lines_width,
  105.       *m = src + ((y+0) & 7)*sc2lines_width,
  106.       *l = src + ((y+1) & 7)*sc2lines_width;
  107.  
  108.    for (unsigned i = 0; i < nPix; i += 4) {
  109.  
  110.       if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {
  111.  
  112.          __m64 mm = *(__m64*)(m+i-2);
  113.          __m64 uu = *(__m64*)(u+i-2);
  114.          __m64 ll = *(__m64*)(l+i-2);
  115.          __m64 md = _mm_slli_si64(mm,8);
  116.          __m64 mf = _mm_srli_si64(mm,8);
  117.          __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md,mf), _mm_cmpeq_pi8(uu,ll));
  118.  
  119.          __m64 e0, e1, v1, v2;
  120.  
  121.          e0 = _mm_cmpeq_pi8(md,uu);
  122.          e0 = _mm_andnot_si64(maskall, e0);
  123.          e0 = _mm_srli_si64(e0,16);
  124.          e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());
  125.  
  126.          e1 = _mm_cmpeq_pi8(mf,uu);
  127.          e1 = _mm_andnot_si64(maskall, e1);
  128.          e1 = _mm_srli_si64(e1,16);
  129.          e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);
  130.  
  131.          e0 = _mm_or_si64(e0, e1);
  132.  
  133.          v1 = _m_from_int(*(unsigned*)(m+i));
  134.          v2 = _m_from_int(*(unsigned*)(u+i));
  135.          v1 = _mm_unpacklo_pi8(v1,v1);
  136.          v2 = _mm_unpacklo_pi8(v2,v2);
  137.  
  138.          *(__m64*)(dst1 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );
  139.  
  140.          e0 = _mm_cmpeq_pi8(md,ll);
  141.          e0 = _mm_andnot_si64(maskall, e0);
  142.          e0 = _mm_srli_si64(e0,16);
  143.          e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());
  144.  
  145.          e1 = _mm_cmpeq_pi8(mf,ll);
  146.          e1 = _mm_andnot_si64(maskall, e1);
  147.          e1 = _mm_srli_si64(e1,16);
  148.          e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);
  149.  
  150.          e0 = _mm_or_si64(e0, e1);
  151.  
  152.          v1 = _m_from_int(*(unsigned*)(m+i));
  153.          v2 = _m_from_int(*(unsigned*)(l+i));
  154.          v1 = _mm_unpacklo_pi8(v1,v1);
  155.          v2 = _mm_unpacklo_pi8(v2,v2);
  156.  
  157.          *(__m64*)(dst2 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );
  158.  
  159.       } else {
  160.  
  161.          __m64 v1 = _m_from_int(*(unsigned*)(m+i));
  162.          v1 = _mm_unpacklo_pi8(v1,v1);
  163.          *(__m64*)(dst1 + 2*i) = v1;
  164.          *(__m64*)(dst2 + 2*i) = v1;
  165.  
  166.       }
  167.  
  168.    }
  169. }
  170.  
  171. #endif // SSE2
  172.  
  173. #else // MMX branched
  174. // src       dst
  175. // ABC       e0e1
  176. // DEF       e2e3
  177. // GHI
  178.  
  179. /*
  180. if(B != H && D != F)
  181. {                               E0 = E;
  182.     E0 = D == B ? D : E;        E1 = E;
  183.     E1 = B == F ? F : E;        E2 = E;
  184.     E2 = D == H ? D : E;        E3 = E;
  185.     E3 = H == F ? F : E;        if(B != H) continue;
  186. }                          =>   if(D != F)
  187. else                            {
  188. {                                   E0 = D == B ? D : E;
  189.     E0 = E;                         E1 = B == F ? F : E;
  190.     E1 = E;                         E2 = D == H ? D : E;
  191.     E2 = E;                         E3 = H == F ? F : E;
  192.     E3 = E;                     }
  193. }
  194. */
  195.  
  196. void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
  197. {
  198.    const unsigned char
  199.       *u = src + ((y-1) & 7)*sc2lines_width,
  200.       *m = src + ((y+0) & 7)*sc2lines_width,
  201.       *l = src + ((y+1) & 7)*sc2lines_width;
  202.  
  203.    // process 4pix per iteration
  204.    for (unsigned i = 0; i < nPix; i += 4)
  205.    {
  206.       unsigned dw = *(unsigned*)(m+i);
  207.       __m64 v1 = _mm_cvtsi32_si64(dw); // v1   =     0|    0|    0|    0|dw[3]|dw[2]|dw[1]|dw[0]
  208.       v1 = _mm_unpacklo_pi8(v1,v1);    // v1   = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
  209.       *(__m64*)(dst1 + 2*i) = v1;      // e0e1 = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
  210.       *(__m64*)(dst2 + 2*i) = v1;      // e2e3 = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
  211.  
  212.       dw = *(unsigned*)(u+i) ^ *(unsigned*)(l+i);
  213.       if (!dw)
  214.           continue; // u == l
  215.  
  216.    #define process_pix(n)                                       \
  217.       if ((dw & (0xFF << (8*n))) && m[i+n-1] != m[i+n+1])       \
  218.       {                                                         \
  219.          if (u[i+n] == m[i+n-1])                                \
  220.              dst1[2*(i+n)] = u[i+n];                            \
  221.          if (u[i+n] == m[i+n+1])                                \
  222.              dst1[2*(i+n)+1] = u[i+n];                          \
  223.          if (l[i+n] == m[i+n-1])                                \
  224.              dst2[2*(i+n)] = l[i+n];                            \
  225.          if (l[i+n] == m[i+n+1])                                \
  226.              dst2[2*(i+n)+1] = l[i+n];                          \
  227.       }
  228.  
  229.       process_pix(0);
  230.       process_pix(1);
  231.       process_pix(2);
  232.       process_pix(3);
  233.    #undef process_pix
  234.    }
  235. }
  236.  
  237. #endif // MMX branched
  238.  
  239. static void lines_scale2_32(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
  240. {
  241.    const u32 *s = (const u32 *)src;
  242.    const u32 *u = s + ((y-1) & 7)*sc2lines_width; // upper
  243.    const u32 *m = s + ((y+0) & 7)*sc2lines_width; // middle
  244.    const u32 *l = s + ((y+1) & 7)*sc2lines_width; // lower
  245.    u32 *d1 = (u32 *)dst1;
  246.    u32 *d2 = (u32 *)dst2;
  247.  
  248.    // first pixel (left)
  249.    d1[0] = d1[0 + 1] = d2[0] = d2[0 + 1] = m[0];
  250.  
  251.    if(u[0] != l[0] && m[0] != m[0 + 1])
  252.    {
  253.        if(u[0] == m[0])
  254.            d1[0] = u[0];
  255.        if(u[0] == m[0 + 1])
  256.            d1[0 + 1] = u[0];
  257.        if(l[0] == m[0])
  258.            d2[0] = l[0];
  259.        if(l[0] == m[0 + 1])
  260.            d2[0 + 1] = l[0];
  261.    }
  262.  
  263.    // central pixels
  264.    for (unsigned i = 1; i < nPix - 1; i++)
  265.    {
  266.       d1[2*i] = d1[2*i+1] = d2[2*i] = d2[2*i+1] = m[i];
  267.  
  268.       if (u[i] != l[i] && m[i-1] != m[i+1])
  269.       {
  270.          if (u[i] == m[i-1])
  271.              d1[2*i] = u[i];
  272.          if (u[i] == m[i+1])
  273.              d1[2*i+1] = u[i];
  274.          if (l[i] == m[i-1])
  275.              d2[2*i] = l[i];
  276.          if (l[i] == m[i+1])
  277.              d2[2*i+1] = l[i];
  278.       }
  279.    }
  280.  
  281.    // last pixel (right)
  282.    const unsigned i = (nPix - 1);
  283.    d1[2 * i] = d1[2 * i + 1] = d2[2 * i] = d2[2 * i + 1] = m[i];
  284.  
  285.    if(u[i] != l[i] && m[i - 1] != m[i])
  286.    {
  287.        if(u[i] == m[i - 1])
  288.            d1[2 * i] = u[i];
  289.        if(u[i] == m[i])
  290.            d1[2 * i + 1] = u[i];
  291.        if(l[i] == m[i - 1])
  292.            d2[2 * i] = l[i];
  293.        if(l[i] == m[i])
  294.            d2[2 * i + 1] = l[i];
  295.    }
  296. }
  297.  
  298. // 8bpp
  299. static void render_scale2(unsigned char *dst, unsigned pitch)
  300. {
  301.    unsigned char *src = rbuf; unsigned delta = temp.scx/4;
  302.    line_8_any(t.scale2buf[0], src);
  303.    // assume 'above' screen line same as line 0
  304.    memcpy(t.scale2buf[(0-1) & 7], t.scale2buf[0], temp.scx);
  305.    for (unsigned y = 0; y < temp.scy; y++)
  306.    {
  307.       src += delta;
  308.       line_8_any(t.scale2buf[(y+1) & 7], src);
  309.       lines_scale2(t.scale2buf[0], y, dst, dst+pitch, temp.scx);
  310.       dst += 2*pitch;
  311.    }
  312. }
  313.  
  314. // 32bpp
  315. static void render_scale2_32(unsigned char *dst, unsigned pitch)
  316. {
  317.    unsigned char *src = rbuf;
  318.    unsigned delta = temp.scx/4;
  319.    line_32_any((u8 *)t.scale2buf32[0], src);
  320.  
  321.    // assume 'above' screen line same as line 0
  322.    memcpy(t.scale2buf32[(0-1) & 7], t.scale2buf32[0], temp.scx);
  323.    for (unsigned y = 0; y < temp.scy; y++)
  324.    {
  325.       src += delta;
  326.       line_32_any((u8 *)t.scale2buf32[(y+1) & 7], src);
  327.       lines_scale2_32((u8 *)t.scale2buf32[0], y, dst, dst+pitch, temp.scx);
  328.       dst += 2*pitch;
  329.    }
  330. }
  331.  
  332. // MMX-vectorized version is not ready yet :(
  333. // 8bpp
  334. static void lines_scale3(unsigned y, unsigned char *dst, unsigned pitch)
  335. {
  336.  
  337.    const unsigned char
  338.       *u = t.scale2buf[(y-1) & 3],
  339.       *m = t.scale2buf[(y+0) & 3],
  340.       *l = t.scale2buf[(y+1) & 3];
  341.  
  342.    for (unsigned i = 0; i < temp.scx; i += 4)
  343.    {
  344.       unsigned char c;
  345.  
  346.       c = m[i];
  347.       dst[3*i+0+0*pitch+ 0] = dst[3*i+1+0*pitch+ 0] = dst[3*i+2+0*pitch+ 0] = c;
  348.       dst[3*i+0+1*pitch+ 0] = dst[3*i+1+1*pitch+ 0] = dst[3*i+2+1*pitch+ 0] = c;
  349.       dst[3*i+0+2*pitch+ 0] = dst[3*i+1+2*pitch+ 0] = dst[3*i+2+2*pitch+ 0] = c;
  350.  
  351.       c = m[i+1];
  352.       dst[3*i+0+0*pitch+ 3] = dst[3*i+1+0*pitch+ 3] = dst[3*i+2+0*pitch+ 3] = c;
  353.       dst[3*i+0+1*pitch+ 3] = dst[3*i+1+1*pitch+ 3] = dst[3*i+2+1*pitch+ 3] = c;
  354.       dst[3*i+0+2*pitch+ 3] = dst[3*i+1+2*pitch+ 3] = dst[3*i+2+2*pitch+ 3] = c;
  355.  
  356.       c = m[i+2];
  357.       dst[3*i+0+0*pitch+ 6] = dst[3*i+1+0*pitch+ 6] = dst[3*i+2+0*pitch+ 6] = c;
  358.       dst[3*i+0+1*pitch+ 6] = dst[3*i+1+1*pitch+ 6] = dst[3*i+2+1*pitch+ 6] = c;
  359.       dst[3*i+0+2*pitch+ 6] = dst[3*i+1+2*pitch+ 6] = dst[3*i+2+2*pitch+ 6] = c;
  360.  
  361.       c = m[i+3];
  362.       dst[3*i+0+0*pitch+ 9] = dst[3*i+1+0*pitch+ 9] = dst[3*i+2+0*pitch+ 9] = c;
  363.       dst[3*i+0+1*pitch+ 9] = dst[3*i+1+1*pitch+ 9] = dst[3*i+2+1*pitch+ 9] = c;
  364.       dst[3*i+0+2*pitch+ 9] = dst[3*i+1+2*pitch+ 9] = dst[3*i+2+2*pitch+ 9] = c;
  365.  
  366.       unsigned dw = *(const unsigned*)(u+i) ^ *(const unsigned*)(l+i);
  367.       if (!dw) continue;
  368.  
  369.    #define process_pix(n)                                                                              \
  370.       if ((dw & (0xFFU << (8U*n))) && m[i+n-1] != m[i+n+1])                                              \
  371.       {                                                                                                \
  372.          if (u[i+n] == m[i+n-1])                                                                       \
  373.              dst[0*pitch+3*(i+n)] = u[i+n];                                                            \
  374.          if ((u[i+n] == m[i+n-1] && m[i+n] != u[i+n+1]) || (u[i+n] == m[i+n+1] && m[i+n] != u[i+n-1])) \
  375.              dst[0*pitch+3*(i+n)+1] = u[i+n];                                                          \
  376.          if (u[i+n] == m[i+n+1])                                                                       \
  377.              dst[0*pitch+3*(i+n)+2] = u[i+n];                                                          \
  378.          if ((u[i+n] == m[i+n-1] && m[i+n] != l[i+n-1]) || (l[i+n] == m[i+n-1] && m[i+n] != u[i+n-1])) \
  379.              dst[1*pitch+3*(i+n)+0] = m[i+n-1];                                                        \
  380.          if ((u[i+n] == m[i+n+1] && m[i+n] != l[i+n+1]) || (l[i+n] == m[i+n+1] && m[i+n] != u[i+n+1])) \
  381.              dst[1*pitch+3*(i+n)+2] = m[i+n+1];                                                        \
  382.          if (l[i+n] == m[i+n-1])                                                                       \
  383.              dst[2*pitch+3*(i+n)] = l[i+n];                                                            \
  384.          if ((l[i+n] == m[i+n-1] && m[i+n] != l[i+n+1]) || (l[i+n] == m[i+n+1] && m[i+n] != l[i+n-1])) \
  385.              dst[2*pitch+3*(i+n)+1] = l[i+n];                                                          \
  386.          if (l[i+n] == m[i+n+1])                                                                       \
  387.              dst[2*pitch+3*(i+n)+2] = l[i+n];                                                          \
  388.       }
  389.  
  390.       process_pix(0);
  391.       process_pix(1);
  392.       process_pix(2);
  393.       process_pix(3);
  394.    #undef process_pix
  395.    }
  396. }
  397.  
  398. // 8bpp
  399. static void render_scale3(unsigned char *dst, unsigned pitch)
  400. {
  401.    unsigned char *src = rbuf; unsigned delta = temp.scx/4;
  402.    line_8_any(t.scale2buf[0], src);
  403.    // assume 'above' screen line same as line 0
  404.    memcpy(t.scale2buf[(0-1) & 3], t.scale2buf[0], temp.scx);
  405.    for (unsigned y = 0; y < temp.scy; y++) {
  406.       src += delta;
  407.       line_8_any(t.scale2buf[(y+1) & 3], src);
  408.       lines_scale3(y, dst, pitch);
  409.       dst += 3*pitch;
  410.    }
  411. }
  412.  
  413. // 32bpp
  414. static void lines_scale3_32(unsigned y, unsigned char *dst, unsigned pitch)
  415. {
  416.    const u32 *u = t.scale2buf32[(y-1) & 3]; // upper
  417.    const u32 *m = t.scale2buf32[(y+0) & 3]; // middle
  418.    const u32 *l = t.scale2buf32[(y+1) & 3]; // lower
  419.    u32 *d = (u32 *)dst;
  420.    pitch /= sizeof(u32);
  421.  
  422.    // first pixel (left)
  423.    d[0 * pitch + 3 * 0 + 0] = d[0 * pitch + 3 * 0 + 1] = d[0 * pitch + 3 * 0 + 2] = m[0];
  424.    d[1 * pitch + 3 * 0 + 0] = d[1 * pitch + 3 * 0 + 1] = d[1 * pitch + 3 * 0 + 2] = m[0];
  425.    d[2 * pitch + 3 * 0 + 0] = d[2 * pitch + 3 * 0 + 1] = d[2 * pitch + 3 * 0 + 2] = m[0];
  426.  
  427.    if(u[0] != l[0] && m[0] != m[0 + 1])
  428.    {
  429.        if(u[0] == m[0])
  430.            d[0 * pitch + 3 * 0 + 0] = u[0];
  431.        if((u[0] == m[0] && m[0] != u[0 + 1]) || (u[0] == m[0 + 1] && m[0] != u[0]))
  432.            d[0 * pitch + 3 * 0 + 1] = u[0];
  433.        if(u[0] == m[0 + 1])
  434.            d[0 * pitch + 3 * 0 + 2] = u[0];
  435.        if((u[0] == m[0] && m[0] != l[0]) || (l[0] == m[0] && m[0] != u[0]))
  436.            d[1 * pitch + 3 * 0 + 0] = m[0];
  437.        if((u[0] == m[0 + 1] && m[0] != l[0 + 1]) || (l[0] == m[0 + 1] && m[0] != u[0 + 1]))
  438.            d[1 * pitch + 3 * 0 + 2] = m[0 + 1];
  439.        if(l[0] == m[0])
  440.            d[2 * pitch + 3 * 0 + 0] = l[0];
  441.        if((l[0] == m[0] && m[0] != l[0 + 1]) || (l[0] == m[0 + 1] && m[0] != l[0]))
  442.            d[2 * pitch + 3 * 0 + 1] = l[0];
  443.        if(l[0] == m[0 + 1])
  444.            d[2 * pitch + 3 * 0 + 2] = l[0];
  445.    }
  446.  
  447.    // central pixels
  448.    for (unsigned i = 1; i < temp.scx - 1; i++)
  449.    {
  450.       d[0*pitch+3*i+0] = d[0*pitch+3*i+1] = d[0*pitch+3*i+2] = m[i];
  451.       d[1*pitch+3*i+0] = d[1*pitch+3*i+1] = d[1*pitch+3*i+2] = m[i];
  452.       d[2*pitch+3*i+0] = d[2*pitch+3*i+1] = d[2*pitch+3*i+2] = m[i];
  453.  
  454.       if (u[i] != l[i] && m[i-1] != m[i+1])
  455.       {
  456.          if (u[i] == m[i-1])
  457.              d[0*pitch+3*i+0] = u[i];
  458.          if ((u[i] == m[i-1] && m[i] != u[i+1]) || (u[i] == m[i+1] && m[i] != u[i-1]))
  459.              d[0*pitch+3*i+1] = u[i];
  460.          if (u[i] == m[i+1])
  461.              d[0*pitch+3*i+2] = u[i];
  462.          if ((u[i] == m[i-1] && m[i] != l[i-1]) || (l[i] == m[i-1] && m[i] != u[i-1]))
  463.              d[1*pitch+3*i+0] = m[i-1];
  464.          if ((u[i] == m[i+1] && m[i] != l[i+1]) || (l[i] == m[i+1] && m[i] != u[i+1]))
  465.              d[1*pitch+3*i+2] = m[i+1];
  466.          if (l[i] == m[i-1])
  467.              d[2*pitch+3*i+0] = l[i];
  468.          if ((l[i] == m[i-1] && m[i] != l[i+1]) || (l[i] == m[i+1] && m[i] != l[i-1]))
  469.              d[2*pitch+3*i+1] = l[i];
  470.          if (l[i] == m[i+1])
  471.              d[2*pitch+3*i+2] = l[i];
  472.       }
  473.    }
  474.  
  475.    // last pixel (right)
  476.    const unsigned i = temp.scx - 1;
  477.    d[0 * pitch + 3 * i + 0] = d[0 * pitch + 3 * i + 1] = d[0 * pitch + 3 * i + 2] = m[i];
  478.    d[1 * pitch + 3 * i + 0] = d[1 * pitch + 3 * i + 1] = d[1 * pitch + 3 * i + 2] = m[i];
  479.    d[2 * pitch + 3 * i + 0] = d[2 * pitch + 3 * i + 1] = d[2 * pitch + 3 * i + 2] = m[i];
  480.  
  481.    if(u[i] != l[i] && m[i - 1] != m[i])
  482.    {
  483.        if(u[i] == m[i])
  484.            d[0 * pitch + 3 * i + 0] = u[i];
  485.        if((u[i] == m[i - 1] && m[i] != u[i]) || (u[i] == m[i] && m[i] != u[i - 1]))
  486.            d[0 * pitch + 3 * i + 1] = u[i];
  487.        if(u[i] == m[i])
  488.            d[0 * pitch + 3 * i + 2] = u[i];
  489.        if((u[i] == m[i - 1] && m[i] != l[i - 1]) || (l[i] == m[i - 1] && m[i] != u[i - 1]))
  490.            d[1 * pitch + 3 * i + 0] = m[i - 1];
  491.        if((u[i] == m[i] && m[i] != l[i]) || (l[i] == m[i] && m[i] != u[i]))
  492.            d[1 * pitch + 3 * i + 2] = m[i];
  493.        if(l[i] == m[i - 1])
  494.            d[2 * pitch + 3 * i + 0] = l[i];
  495.        if((l[i] == m[i - 1] && m[i] != l[i]) || (l[i] == m[i] && m[i] != l[i - 1]))
  496.            d[2 * pitch + 3 * i + 1] = l[i];
  497.        if(l[i] == m[i])
  498.            d[2 * pitch + 3 * i + 2] = l[i];
  499.    }
  500. }
  501.  
  502. // 32bpp
  503. static void render_scale3_32(unsigned char *dst, unsigned pitch)
  504. {
  505.    unsigned char *src = rbuf; unsigned delta = temp.scx/4;
  506.    line_32_any((u8 *)t.scale2buf32[0], src);
  507.    // assume 'above' screen line same as line 0
  508.    memcpy(t.scale2buf32[(0-1) & 3], t.scale2buf32[0], temp.scx);
  509.    for (unsigned y = 0; y < temp.scy; y++)
  510.    {
  511.       src += delta;
  512.       line_32_any((u8 *)t.scale2buf32[(y+1) & 3], src);
  513.       lines_scale3_32(y, dst, pitch);
  514.       dst += 3*pitch;
  515.    }
  516. }
  517.  
  518. static void render_scale4(unsigned char *dst, unsigned pitch)
  519. {
  520.    unsigned char *src = rbuf; unsigned delta = temp.scx/4;
  521.  
  522.    line_8_any(t.scale2buf[0], src); src += delta;
  523.    line_8_any(t.scale2buf[1], src); src += delta;
  524.    // assume 'above' screen line same as line 0
  525.    memcpy(t.scale2buf[(0-1) & 7], t.scale2buf[0], temp.scx);
  526.    lines_scale2(t.scale2buf[0], 0, t.scale4buf[0], t.scale4buf[1], temp.scx);
  527.  
  528.    for (unsigned y = 0; y < temp.scy; y++) {
  529.  
  530.       line_8_any(t.scale2buf[(y+2) & 7], src); src += delta;
  531.  
  532.       unsigned char *dst1 = t.scale4buf[(2*y+2) & 7];
  533.       unsigned char *dst2 = t.scale4buf[(2*y+3) & 7];
  534.       lines_scale2(t.scale2buf[0], y+1, dst1, dst2, temp.scx);
  535.  
  536.       lines_scale2(t.scale4buf[0], 2*y,   dst+0*pitch, dst+1*pitch, temp.scx*2);
  537.       lines_scale2(t.scale4buf[0], 2*y+1, dst+2*pitch, dst+3*pitch, temp.scx*2);
  538.  
  539.       dst += 4*pitch;
  540.    }
  541. }
  542.  
  543. void __fastcall render_advmame(unsigned char *dst, unsigned pitch)
  544. {
  545.    switch (conf.videoscale)
  546.    {
  547.       case 2:
  548.           if(temp.obpp == 8) render_scale2(dst, pitch);
  549.           else if(temp.obpp == 32) render_scale2_32(dst, pitch);
  550.       break;
  551.       case 3:
  552.           if(temp.obpp == 8) render_scale3(dst, pitch);
  553.           else if(temp.obpp == 32) render_scale3_32(dst, pitch);
  554.       break;
  555.       case 4: render_scale4(dst, pitch); break;
  556.       default: render_small(dst, pitch); return; // skip noflic test
  557.    }
  558.    if (conf.noflic)
  559.        memcpy(rbuf_s, rbuf, temp.scy*temp.scx/4);
  560. #ifndef MOD_SSE2
  561.    _mm_empty();
  562. #endif
  563. }
  564.