#include "std.h"
#include "emul.h"
#include "vars.h"
#include "draw.h"
#include "dxr_rsm.h"
// Global resampler state shared by the render_rsm path and calc_rsm_tables() in this file.
RSM_DATA rsm;
void RSM_DATA::prepare_line_32(unsigned char *src0)
{
unsigned i;
for(i = 0; i < line_size_d; i++)
{
line_buffer_d[i] = bias;
}
unsigned line_size = line_size_d / (sizeof(__m_vec) / sizeof(DWORD)), frame = 0;
const __m_vec *tab = colortab;
for(;;)
{
unsigned char *src = src0;
for(i = 0; i < line_size; )
{
#ifdef MOD_SSE2
// ┬√тюф 16 Єюўхъ, ърцф√щ _mm_add_epi8 юсЁрсрЄ√трхЄ 4 Єюўъш чр Ёрч
// s=[ap|ap]
unsigned s = *(unsigned*)src, attr = (s >> 4) & 0xFF0;
line_buffer[i + 0] = _mm_add_epi8(line_buffer[i + 0], tab[((s >> 4) & 0xF) + attr]);
line_buffer[i + 1] = _mm_add_epi8(line_buffer[i + 1], tab[((s >> 0) & 0xF) + attr]);
attr = (s >> 20) & 0xFF0;
line_buffer[i + 2] = _mm_add_epi8(line_buffer[i + 2], tab[((s >> 20) & 0xF) + attr]);
line_buffer[i + 3] = _mm_add_epi8(line_buffer[i + 3], tab[((s >> 16) & 0xF) + attr]);
i += 4;
src += 4;
#else
unsigned s = *(unsigned*)src, attr = (s >> 6) & 0x3FC;
line_buffer[i + 0] = _mm_add_pi8(line_buffer[i + 0], tab[((s >> 6) & 3) + attr]);
line_buffer[i + 1] = _mm_add_pi8(line_buffer[i + 1], tab[((s >> 4) & 3) + attr]);
line_buffer[i + 2] = _mm_add_pi8(line_buffer[i + 2], tab[((s >> 2) & 3) + attr]);
line_buffer[i + 3] = _mm_add_pi8(line_buffer[i + 3], tab[((s >> 0) & 3) + attr]);
attr = (s >> 22) & 0x3FC;
line_buffer[i + 4] = _mm_add_pi8(line_buffer[i + 4], tab[((s >> 22) & 3) + attr]);
line_buffer[i + 5] = _mm_add_pi8(line_buffer[i + 5], tab[((s >> 20) & 3) + attr]);
line_buffer[i + 6] = _mm_add_pi8(line_buffer[i + 6], tab[((s >> 18) & 3) + attr]);
line_buffer[i + 7] = _mm_add_pi8(line_buffer[i + 7], tab[((s >> 16) & 3) + attr]);
i += 8;
src += 4;
#endif // MOD_SSE2
}
if(++frame == mix_frames)
{
break;
}
src0 += rb2_offs;
if(src0 >= rbuf_s + rb2_offs * mix_frames)
{
src0 -= rb2_offs * mix_frames;
}
tab += 0x100 * (1 << (sizeof(__m_vec) / sizeof(DWORD)));
}
}
void RSM_DATA::prepare_line_16(unsigned char *src0)
{
unsigned i;
for(i = 0; i < line_size_d; i++)
{
line_buffer_d[i] = bias;
}
unsigned line_size = line_size_d / 2, frame = 0;
const __m_vec *tab = colortab;
for(;;)
{
unsigned char *src = src0;
for(i = 0; i < line_size; )
{
#ifdef MOD_SSE2
// ═х Ёхрышчютрээю
src += 4; i += 4;
#else
unsigned s = *(unsigned*)src, attr = (s >> 4) & 0xFF0;
line_buffer[i + 0] = _mm_add_pi8(line_buffer[i + 0], tab[((s >> 4) & 0xF) + attr]);
line_buffer[i + 1] = _mm_add_pi8(line_buffer[i + 1], tab[((s >> 0) & 0xF) + attr]);
attr = (s >> 20) & 0xFF0;
line_buffer[i + 2] = _mm_add_pi8(line_buffer[i + 2], tab[((s >> 20) & 0xF) + attr]);
line_buffer[i + 3] = _mm_add_pi8(line_buffer[i + 3], tab[((s >> 16) & 0xF) + attr]);
src += 4; i += 4;
#endif // MOD_SSE2
}
if(++frame == mix_frames)
{
break;
}
src0 += rb2_offs;
if(src0 >= rbuf_s + rb2_offs * mix_frames)
{
src0 -= rb2_offs * mix_frames;
}
tab += 0x100 * 16;
}
}
void RSM_DATA::prepare_line_8(unsigned char *src0)
{
unsigned i;
for(i = 0; i < line_size_d; i++)
{
line_buffer_d[i] = bias;
}
unsigned frame = 0;
const DWORD *tab = (const DWORD*)colortab;
for(;;)
{
unsigned char *src = src0;
for(i = 0; i < line_size_d; )
{
unsigned s = *(unsigned*)src, attr = (s >> 4) & 0xFF0;
line_buffer_d[i + 0] += tab[((s >> 4) & 0xF) + attr];
line_buffer_d[i + 1] += tab[((s >> 0) & 0xF) + attr];
attr = (s >> 20) & 0xFF0;
line_buffer_d[i + 2] += tab[((s >> 20) & 0xF) + attr];
line_buffer_d[i + 3] += tab[((s >> 16) & 0xF) + attr];
s = *(unsigned*)(src + 4);
attr = (s >> 4) & 0xFF0;
line_buffer_d[i + 4] += tab[((s >> 4) & 0xF) + attr];
line_buffer_d[i + 5] += tab[((s >> 0) & 0xF) + attr];
attr = (s >> 20) & 0xFF0;
line_buffer_d[i + 6] += tab[((s >> 20) & 0xF) + attr];
line_buffer_d[i + 7] += tab[((s >> 16) & 0xF) + attr];
src += 8;
i += 8;
}
if(++frame == mix_frames)
{
break;
}
src0 += rb2_offs;
if(src0 >= rbuf_s + rb2_offs * mix_frames)
{
src0 -= rb2_offs * mix_frames;
}
tab += 0x100 * 16;
}
}
// Renders temp.scy lines: each line is built by rsm.prepare_line_8() and then
// copied DWORD by DWORD to the destination surface.
//   dst   - first destination line; advanced by `pitch` bytes per line
//   pitch - destination stride in bytes
//   src   - first source line; advanced by temp.scx / 4 bytes per line
static void rend_rsm_8(unsigned char *dst, unsigned pitch, unsigned char *src)
{
    const unsigned src_step = temp.scx / 4;
    unsigned row = 0;
    while(row < temp.scy)
    {
        rsm.prepare_line_8(src);

        unsigned *out = (unsigned*)dst;
        for(unsigned n = 0; n < rsm.line_size_d; ++n)
        {
            out[n] = rsm.line_buffer_d[n];
        }

        src += src_step;
        dst += pitch;
        ++row;
    }
}
// Packs one 32-bit pixel to 16 bits by keeping the top 5, 6 and 5 bits of its
// three 8-bit channels (bits 0..7, 8..15 and 16..23 respectively).
static inline unsigned rsm_pack_565(unsigned c)
{
    return ((c >> 3) & 0x1F) + ((c >> 5) & 0x07E0) + ((c >> 8) & 0xF800);
}

// Packs one 32-bit pixel to 15 bits by keeping the top 5 bits of each of its
// three 8-bit channels.
static inline unsigned rsm_pack_555(unsigned c)
{
    return ((c >> 3) & 0x1F) + ((c >> 6) & 0x03E0) + ((c >> 9) & 0x7C00);
}

// Renders temp.scy lines for 16-bit output: each line is built as 32-bit
// pixels by rsm.prepare_line_32() and then packed two pixels per DWORD.
// temp.hi15 == 0 selects the 5:6:5 layout, any other value the 5:5:5 layout.
//   dst   - first destination line; advanced by `pitch` bytes per line
//   pitch - destination stride in bytes
//   src   - first source line; advanced by temp.scx / 4 bytes per line
static void rend_rsm_16(unsigned char *dst, unsigned pitch, unsigned char *src)
{
    const unsigned src_step = temp.scx / 4;
    for(unsigned row = 0; row < temp.scy; ++row)
    {
        rsm.prepare_line_32(src);

        // Two source pixels produce one destination DWORD: the even pixel
        // goes to the low half, the odd pixel to the high half.
        unsigned *out = (unsigned*)dst;
        if(temp.hi15 != 0)
        {
            for(unsigned px = 0; px < rsm.line_size_d; px += 2)
            {
                const unsigned even = rsm.line_buffer_d[px];
                const unsigned odd = rsm.line_buffer_d[px + 1];
                out[px / 2] = rsm_pack_555(even) + (rsm_pack_555(odd) << 16);
            }
        }
        else
        {
            for(unsigned px = 0; px < rsm.line_size_d; px += 2)
            {
                const unsigned even = rsm.line_buffer_d[px];
                const unsigned odd = rsm.line_buffer_d[px + 1];
                out[px / 2] = rsm_pack_565(even) + (rsm_pack_565(odd) << 16);
            }
        }

        src += src_step;
        dst += pitch;
    }
}
// Renders temp.scy lines: each line is built by rsm.prepare_line_16() and then
// copied DWORD by DWORD to the destination surface.
//   dst   - first destination line; advanced by `pitch` bytes per line
//   pitch - destination stride in bytes
//   src   - first source line; advanced by temp.scx / 4 bytes per line
static void rend_rsm_16o(unsigned char *dst, unsigned pitch, unsigned char *src)
{
    const unsigned src_step = temp.scx / 4;
    unsigned row = 0;
    while(row < temp.scy)
    {
        rsm.prepare_line_16(src);

        unsigned *out = (unsigned*)dst;
        for(unsigned n = 0; n < rsm.line_size_d; ++n)
        {
            out[n] = rsm.line_buffer_d[n];
        }

        src += src_step;
        dst += pitch;
        ++row;
    }
}
// Renders temp.scy lines: each line is built by rsm.prepare_line_32() and then
// copied DWORD by DWORD to the destination surface.
//   dst   - first destination line; advanced by `pitch` bytes per line
//   pitch - destination stride in bytes
//   src   - first source line; advanced by temp.scx / 4 bytes per line
static void rend_rsm_32(unsigned char *dst, unsigned pitch, unsigned char *src)
{
    const unsigned src_step = temp.scx / 4;
    unsigned row = 0;
    while(row < temp.scy)
    {
        rsm.prepare_line_32(src);

        unsigned *out = (unsigned*)dst;
        for(unsigned n = 0; n < rsm.line_size_d; ++n)
        {
            out[n] = rsm.line_buffer_d[n];
        }

        src += src_step;
        dst += pitch;
        ++row;
    }
}
// Renderer entry point: selects the colour table for the current rsm.frame,
// locates the source frame rsm.rbuf_dst inside rbuf_s and dispatches to the
// line renderer matching the output depth temp.obpp (8, 16 or 32).
// Other depths render nothing.
//   dst   - destination surface
//   pitch - destination stride in bytes
void __fastcall render_rsm(unsigned char *dst, unsigned pitch)
{
    // Tables for consecutive frames are frame_table_size bytes apart.
    const u8 *const table_bytes = (const u8 *)rsm.tables;
    rsm.colortab = (const RSM_DATA::__m_vec*)(table_bytes + rsm.frame * rsm.frame_table_size);

    unsigned char *const src = rbuf_s + rb2_offs * rsm.rbuf_dst;

    switch(temp.obpp)
    {
    case 8:
        rend_rsm_8(dst, pitch, src);
        break;

    case 16:
        // rsm.mode 0 packs 32-bit pixels down to 16 bits;
        // any other mode goes through the prepare_line_16 path.
        if(rsm.mode != 0)
        {
            rend_rsm_16o(dst, pitch, src);
        }
        else
        {
            rend_rsm_16(dst, pitch, src);
        }
        break;

    case 32:
        rend_rsm_32(dst, pitch, src);
        break;

    default:
        break;
    }

#ifndef MOD_SSE2
    _mm_empty(); // EMMS: leave MMX state after the MMX line routines
#endif // MOD_SSE2
}
// Greatest common divisor of x and y, computed with Euclid's algorithm.
// A zero argument yields the other argument (gcd(0, 0) == 0). The previous
// subtraction-based loop never terminated when exactly one argument was 0 and
// needed on the order of max/min iterations for very unequal inputs.
static unsigned gcd(unsigned x, unsigned y)
{
    while(y != 0)
    {
        const unsigned rem = x % y;
        x = y;
        y = rem;
    }
    return x;
}
// Least common multiple of x and y; returns 0 when either argument is 0.
// The division is done before the multiplication so the intermediate value
// never exceeds the result (x * y could overflow 32 bits even when the LCM
// itself fits). The zero guard also avoids dividing by gcd(0, 0) == 0.
static unsigned lcm(unsigned x, unsigned y)
{
    if(x == 0 || y == 0)
    {
        return 0;
    }
    return x / gcd(x, y) * y;
}
// Rebuilds the global resampler state `rsm` for the current configuration.
// Steps, in order:
//   1. reset the frame counters;
//   2. if the active renderer is not render_rsm, set a trivial state and return;
//   3. choose the table mode from temp.obpp / temp.hi15 and derive the line size;
//   4. derive period and step from lcm(conf.intfq, temp.ofq);
//   5. (re)allocate one buffer holding the tables followed by needframes[];
//   6. compute one mixing weight per (output phase, mixed frame), either from
//      a windowed-sinc filter or, for RSM_SIMPLE, as a two-frame blend;
//   7. fill the lookup tables from t.sctab32 / t.sctab16 / t.sctab8 and scale
//      every byte of them by the matching weight.
void calc_rsm_tables()
{
rsm.rbuf_dst = rsm.frame = 0;
// Resampling renderer not selected: one frame per period, no tables built.
if(renders[conf.render].func != render_rsm)
{
rsm.mix_frames = rsm.period = 1;
static unsigned char one = 1;
rsm.needframes = &one; // rsm.needframes[0]=1
return;
}
// Table mode: 2 for 8-bit output, 1 for 16-bit output with temp.hi15 == 2,
// 0 otherwise (see the case labels in the table-filling switch below).
rsm.mode = (temp.obpp == 8) ? 2 : 0;
if(temp.obpp == 16 && temp.hi15 == 2)
{
rsm.mode = 1;
}
// Line length in DWORDs: temp.scx shifted right by the mode number.
rsm.line_size_d = (temp.scx >> rsm.mode);
// fmax is the least common multiple of the two rates; period and step are
// fmax divided by conf.intfq and temp.ofq respectively.
unsigned fmax = lcm(conf.intfq, temp.ofq);
rsm.period = fmax / conf.intfq;
unsigned step = fmax / temp.ofq;
// RSM_SIMPLE always blends exactly two frames.
rsm.mix_frames = (conf.rsm.mode == RSM_SIMPLE) ? 2 : conf.rsm.mix_frames;
// Size in bytes of the tables for one output phase: mix_frames tables of
// 0x100 rows, with a per-mode row size.
rsm.frame_table_size = rsm.mix_frames * 0x100;
if(rsm.mode == 0)
{
rsm.frame_table_size *= (1 << (sizeof(RSM_DATA::__m_vec) / sizeof(DWORD))) * sizeof(RSM_DATA::__m_vec);
}
if(rsm.mode == 1)
{
// NOTE(review): the mode 1 fill loop below writes 16 * 8 bytes per row, which
// equals 16 * sizeof(__m_vec) only when __m_vec is 8 bytes wide - confirm for
// MOD_SSE2 builds.
rsm.frame_table_size *= 16 * sizeof(RSM_DATA::__m_vec);
}
if(rsm.mode == 2)
{
rsm.frame_table_size *= 16 * sizeof(DWORD);
}
// One allocation: `period` table sets followed by `period` needframes bytes.
// NOTE(review): the realloc result is not checked; on failure rsm.data becomes
// null (the old block is leaked) and the writes below dereference it.
rsm.data = (unsigned char*)realloc(rsm.data, rsm.period * (rsm.frame_table_size + 1));
rsm.tables = (RSM_DATA::__m_vec*)rsm.data;
rsm.needframes = rsm.data + rsm.frame_table_size * rsm.period;
// Stack scratch: one weight per (output phase, mixed frame), written through
// `dst` here and consumed in the same order by the table-scaling loop.
double *weights = (double*)alloca(rsm.period * rsm.mix_frames * sizeof(double));
double *dst = weights;
unsigned low_bias = 0, dynamic_range = 0xFF;
if(conf.rsm.mode != RSM_SIMPLE)
{
// Filter with fsize + 1 taps, centred on index fsize / 2.
unsigned fsize = rsm.period * rsm.mix_frames;
double *flt = (double*)alloca((fsize + 1) * sizeof(double));
double cutoff = 0.9;
if(conf.rsm.mode == RSM_FIR1)
{
cutoff = 0.5;
}
if(conf.rsm.mode == RSM_FIR2)
{
cutoff = 0.33333333;
}
cutoff *= 1 / (double)rsm.period; // cutoff scale = intfq/fmax = 1/rsm.period
// Window coefficients 0.54 / 0.46 (Hamming window), pre-divided by pi.
double c1 = 0.54 / M_PI, c2 = 0.46 / M_PI;
for(unsigned i = 0; i <= fsize; i++)
{
if(i == fsize / 2)
{
// Centre tap is set directly; the general formula would divide by zero here.
flt[i] = cutoff;
}
else
{
// sin(pi*cutoff*d) / (pi*d) multiplied by the window, with d = i - fsize/2
// (fsize / 2 is an integer division).
flt[i] = sin(M_PI*cutoff*((double)i - fsize / 2)) * (c1 - c2 * cos(2 * M_PI*(double)i / fsize)) / ((double)i - fsize / 2);
}
}
// Most negative / most positive per-phase weight sums seen so far.
// NOTE(review): high_b is tracked but never read after the loop.
double low_b = 0, high_b = 0;
for(unsigned frame = 0; frame < rsm.period; frame++)
{
// Position of this output phase and of the next one, in units where one
// source frame is rsm.period long; srcframe/nextframe round up to the next
// source frame except at the period boundaries.
unsigned pos = frame * step, srcframe = pos / rsm.period;
if(frame)
{
srcframe++;
}// (pos % rsm.period) != 0
unsigned nextpos = pos + step, nextframe = nextpos / rsm.period;
if(frame + 1 != rsm.period)
{
nextframe++;
} // (nextpos % rsm.period) != 0
// Difference between the rounded source-frame indices of this and the next phase.
rsm.needframes[frame] = u8(nextframe - srcframe);
double low = 0, high = 0;
// Filter tap index of the first mixed frame; later frames are one period apart.
unsigned offset = (srcframe * rsm.period) - pos;
for(unsigned ch = 0; ch < rsm.mix_frames; ch++)
{
double weight = flt[offset] * rsm.period;
if(weight < 0)
{
low += weight;
}
else
{
high += weight;
}
*dst++ = weight;
offset += rsm.period;
}
if(low < low_b)
{
low_b = low;
}
if(high > high_b)
{
high_b = high;
}
}
// Bias derived from the largest negative weight sum; the remaining part of
// 0xFF is the range used to scale the table values.
low_bias = (unsigned)((-low_b) * 0xFF);
dynamic_range = (0xFF - low_bias);
}
else
{ // RSM_SIMPLE
// Two weights per output phase that always sum to 1.
double div = 1 / (double)step;
for(unsigned frame = 0; frame < rsm.period; frame++)
{
unsigned pos = frame * step, srcframe = pos / rsm.period;
unsigned nextpos = pos + step, nextframe = nextpos / rsm.period;
// Full weight on the first frame when both positions fall in the same source
// frame; otherwise the part of the step past the last frame boundary.
unsigned offset = (srcframe == nextframe) ? step : (nextpos - nextframe * rsm.period);
rsm.needframes[frame] = u8(nextframe - srcframe);
*dst++ = offset * div;
*dst++ = (step - offset) * div;
}
}
// Replicate the bias byte into all four bytes of a DWORD.
rsm.bias = 0x01010101 * low_bias;
// Fill the tables: for each output phase and each mixed frame, write one
// table in the mode-specific layout, then scale its bytes by the weight.
unsigned char *dst32 = (unsigned char*)rsm.tables;
for(unsigned frame = 0; frame < rsm.period; frame++)
{
for(unsigned ch = 0; ch < rsm.mix_frames; ch++)
{
unsigned char *start_frame = dst32;
switch(rsm.mode)
{
case 0: // rgb 16/32bit - reorder table, MMX processes 2 truecolor pixels at one op
for(unsigned a = 0; a < 0x100; a++)
{
#ifdef MOD_SSE2
for(unsigned pix = 0; pix < 16; pix++) // 4 b/w pixels, 16 combinations
{
*(DWORD*)(dst32 + 0*4) = t.sctab32[0][((pix >> 3) & 1) * 0x100 + a]; // PC video memory value in xrgb32; input: 1 b/w pixel + attribute
*(DWORD*)(dst32 + 1*4) = t.sctab32[0][((pix >> 2) & 1) * 0x100 + a]; // PC video memory value in xrgb32; input: 1 b/w pixel + attribute
*(DWORD*)(dst32 + 2*4) = t.sctab32[0][((pix >> 1) & 1) * 0x100 + a]; // PC video memory value in xrgb32; input: 1 b/w pixel + attribute
*(DWORD*)(dst32 + 3*4) = t.sctab32[0][((pix >> 0) & 1) * 0x100 + a]; // PC video memory value in xrgb32; input: 1 b/w pixel + attribute
dst32 += 4*4;
}
#else
for(unsigned pix = 0; pix < 4; pix++) // 2 b/w pixels, 4 combinations
{
*(DWORD*)(dst32 + 0) = t.sctab32[0][(pix >> 1) * 0x100 + a]; // PC video memory value in xrgb32; input: 1 b/w pixel + attribute
*(DWORD*)(dst32 + 4) = t.sctab32[0][(pix & 1) * 0x100 + a]; // PC video memory value in xrgb32; input: 1 b/w pixel + attribute
dst32 += 8;
}
#endif // MOD_SSE2
}
break;
case 1: // YUY2 overlay - reorder table, MMX processes 4 overlay pixels at one op
for(unsigned a = 0; a < 0x100; a++)
{
for(unsigned pix = 0; pix < 16; pix++) // 4 b/w pixels, 16 combinations
{
*(DWORD*)(dst32 + 0) = t.sctab16[0][a * 4 + (pix >> 2)]; // 2 PC video memory values each; input: 2 b/w pixels + attribute
*(DWORD*)(dst32 + 4) = t.sctab16[0][a * 4 + (pix & 3)]; // 2 PC video memory values each; input: 2 b/w pixels + attribute
dst32 += 8;
}
}
break;
case 2: // 8bit, tables: 16*attr+4pix => 4 pixels in DWORD
memcpy(dst32, t.sctab8[0], 0x100 * 16 * sizeof(DWORD));
dst32 += 0x100 * 16 * sizeof(DWORD);
break;
default:
// rsm.mode is only ever set to 0, 1 or 2 above.
__assume(0);
}
// Scale every byte of the table just written by this frame's weight.
double scale = *weights++ * dynamic_range * (1.0 / 256.0);
for(unsigned char *ptr = start_frame; ptr < dst32; ptr++)
{
double color = scale * *ptr;
if(color > 255.0)
{
color = 255.0;
}
// Negative results (negative weights) are stored modulo 256.
if(color < 0.0)
{
color += 256.0;
} // color = 0.0;
*ptr = (unsigned char)color;
}
}
}
}