//-------------------------------------------------------------------------------------------------
// Matrox10bitUnpack.fx
//
// Copyright (c) LWKS Software Ltd.  All Rights Reserved
//-------------------------------------------------------------------------------------------------

// The source image. ps_main reads its channels as raw packed bytes (see the
// format description in ps_main), not as displayable colour.
texture gSourceImage; // The source image

// Sampler used by ps_main for every fetch from gSourceImage.
// All three filters are POINT: ps_main reconstructs byte values from the
// sampled channels, so neighbouring texels must not be blended together.
// NOTE(review): ClampToEdge is not defined in this file; presumably supplied by
// the host effect framework as the clamp address mode - confirm.
sampler SourceImageSampler = 
sampler_state
{
   Texture = <gSourceImage>;
   MipFilter = POINT;
   MinFilter = POINT;
   MagFilter = POINT; //GAUSSIANQUAD; //POINT; //LINEAR;
   AddressU  = ClampToEdge;
   AddressV  = ClampToEdge;
};

// Width of the source image in DWORDs. ps_main divides its computed DWORD
// x-offsets (which count both base-block and extras-block DWORDs) by this value
// to obtain a normalised texture x-coordinate.
// NOTE(review): presumably set by the host application before rendering - confirm.
float pSourceImageTotalWidthInDWORDs;

// Pixel shader: unpacks one DWORD (four components) of Matrox 10-bit YCrCb 4:2:2 data.
//
// uv1 (TEXCOORD1): x is used as the output column, counted in base (8-bit) DWORDs;
//                  y is passed unchanged to tex2D as the normalised row coordinate.
//
// Returns four components, each the 10-bit value ( 8 MSBs from the base block plus
// 2 LSBs from the extras block ) scaled by 64 / 65535.
float4 ps_main( float2 uv1 : TEXCOORD1 ) : COLOR0
{
   // The source surface must be in A8R8G8B8 format, wrapping Matrox-format data. The rendertarget is expected
   // to be in X16R16G16B16. The output data will be in 16-bit YCrCb422 format, with values normalised to the range 0.0->1.0.
   //

   // Matrox 10 bit YCrCb 4:2:2 format is packed as follows:
   //
   //               |------------------------------------------ BASE BLOCK --------------------||---------------- EXTRAS BLOCK ----------------|
   // DWORD offset  |----------- 0 ---------||---------- 1 ---------|.. |---------- 7 ---------||---------- 8 ---------||---------- 9 ---------|
   // Byte Offset   0      1     2     3     4     5     6     7     .. 28    29    30    31    32    33    34    35    36    37    38    39
   // Value         Y0     Cb0   Y1    Cr0   Y2    Cb2   Y3    Cr2   .. Y14   Cb14  Y15   Cr14  Packed high-order 2 bits of preceding pixels, order TBC
   //
   // NOTE(review): the diagram calls the extras "high-order" bits, but the code below merges them in as the
   // two least significant bits of each 10-bit value - confirm which description is intended.

   const int   kBlockSizeInComponentValuesI = 32;

   const int   kBaseBlockSizeInDWORDsI = kBlockSizeInComponentValuesI / 4;       // eg, 8
   const float kBaseBlockSizeInDWORDsF = (float)kBaseBlockSizeInDWORDsI;         //     8.0

   const int   kExtrasBlockSizeInDWORDsI = kBlockSizeInComponentValuesI / 16;    //     2
   const float kExtrasBlockSizeInDWORDsF = (float)kExtrasBlockSizeInDWORDsI;     //     2.0 (not referenced below)

   const int   kTotalBlockSizeInDWORDsI = kBaseBlockSizeInDWORDsI + kExtrasBlockSizeInDWORDsI;  // 10
   const float kTotalBlockSizeInDWORDsF = (float)kTotalBlockSizeInDWORDsI;       // 10.0

   // uv1 (TEXCOORD1): y = output row 0.0->1.0 format. x = output column, 0 - ( SourceImageWidth - 1 ) format.

   // Split the output column into the index of the 10-DWORD block it belongs to and the
   // position ( 0..7 ) of the base DWORD within that block.
   float phase = fmod( uv1.x, kBaseBlockSizeInDWORDsF );
   float block = floor( uv1.x / kBaseBlockSizeInDWORDsF );

   // Determine the x-coord where the four 8-bit values will come from:
   float xDWORDPixelOffset = floor( ( block * kTotalBlockSizeInDWORDsF ) + phase );

   // Determine the x-coord of the DWORD where the byte of 2-bit extras will come from.
   // Each extras DWORD serves four base DWORDs, hence phase / 4. This value is not floored, so it
   // keeps a fractional part.
   float xDWORDExtrasOffset = ( block * kTotalBlockSizeInDWORDsF ) + kBaseBlockSizeInDWORDsF + ( phase / 4 );

   // We'll end up with a float4 of extras wrapping 4 bytes. We need just one byte of that. Determine the
   // vector index into the float4 which we need to use.
   float xExtrasVectorIndex = fmod( phase, 4.0 );

   // Address the centre ( + 0.5 ) of the base DWORD's texel.
   uv1.x = ( xDWORDPixelOffset + 0.5 ) / pSourceImageTotalWidthInDWORDs;

   // start with the 8 bit value from the input
   // NOTE(review): base is used in the sampler's native channel order, whereas extras below has its
   // red and blue channels swapped - confirm this matches the intended output component order.
   float4 base;
   base = tex2D( SourceImageSampler, uv1 );

   // set extras data position
   // NOTE(review): the offset here is 0.1 rather than 0.5 because xDWORDExtrasOffset already carries the
   // fractional part phase / 4. This presumably assumes uv1.x has no large fractional part; a value just
   // below a multiple of four could land in the next DWORD - confirm against the caller.
   uv1.x = ( xDWORDExtrasOffset + 0.1 ) / pSourceImageTotalWidthInDWORDs;

   // sample extra 2-bit values from the source block.
   // The swizzle swaps the red and blue channels ( alpha and green are unchanged ), presumably so that
   // extras[ 0..3 ] follows the byte order of the A8R8G8B8 DWORD in memory - confirm.
   float4 extras;
   extras.argb = tex2D( SourceImageSampler, uv1 ).abgr;

   // extract required byte as 0..255
   // ( the float index relies on the compiler's implicit float-to-int conversion; + 0.5 rounds to nearest )
   float extraB = floor( ( 255.0 * extras[ xExtrasVectorIndex ] ) + 0.5 );

   // base[n] contains the most significant 8 bits, with the two additional LSBs coming from two bits of extraB.

   // We need to get the four 2-bit numbers out of extraB. We don't have bitwise operators like &, >> or <<, so
   // this is a bit messy. Also, integers may well be emulated using floating point in the GPU, so we can't rely
   // on 'normal' integer rounding and overflow behaviour.
   //
   // Each step divides by the weight of the bit pair, floors, and subtracts the pairs already extracted.
   // The small bias added before each floor ( half a unit of extraB ) guards against a quotient landing
   // fractionally below an integer.

   float bb76 = floor( ( extraB / 64.0 ) + ( 0.5 / 64.0 ) );                                                   // bits 7..6
   float bb54 = floor( ( ( extraB - ( bb76 * 64.0 ) ) / 16.0 ) + ( 0.5 / 16.0 ) );                             // bits 5..4
   float bb32 = floor( ( ( extraB - ( ( bb76 * 64.0 ) + ( bb54 * 16.0 ) ) ) / 4.0 ) + ( 0.5 / 4.0 ) );         // bits 3..2
   float bb10 = floor( ( extraB - ( ( bb76 * 64.0 ) + ( bb54 * 16.0 )  + ( bb32 * 4.0 ) ) ) + 0.5 );           // bits 1..0

   // create array of 4 2-bit values corresponding to Y0 Cb0 Y1 Cr0 ( lowest bit pair first )
   float4 bbf;
   bbf[0] = bb10;	// Y0
   bbf[1] = bb32;	// Cb0
   bbf[2] = bb54;	// Y1
   bbf[3] = bb76;	// Cr0

   float4 ret;

   // Y values are offset by 16/256
   // Cr/Cb values are offset by 128/256
   // ( these offsets are not removed here; the values are passed through as stored )

   // Shift each 8-bit value left by two bits ( x 4 ) and merge in the bottom 2 bits, giving 0..1023.
   ret = ( base * 255.0f * 4.0f + bbf );

   // scale 10bit value to 16 bit ( x 64 ), then normalise by the 16-bit maximum
   return ret * 64.0f / 65535.0f;
   // approximately 58 instruction slots used (2 texture, 56 arithmetic)
};

// Single technique with a single pass that runs ps_main as the pixel shader.
// NOTE(review): PROFILE is not defined in this file; presumably the host supplies
// the pixel shader profile at compile time - confirm.
technique T1 { pass P1 { PixelShader = compile PROFILE ps_main(); } }

