#version 450 core

/* Compute shader: every work group loads FFTXTPIX keys (with one float    */
/* weight per key) into shared memory, sorts them by key, writes the       */
/* sorted keys/weights back, and finally stores a prefix sum of "new key"  */
/* flags.  See main() for the individual phases.                           */

/* HIRES selects the larger of the two supported texture sizes below.      */
#define HIRES
/* Sentinel key: main() gives such entries weight 0.0 and flag 0.  Being   */
/* the largest uint value, it compares greater than or equal to any key.   */
/* NOTE(review): the name suggests a primitive restart index - confirm.    */
#define RESTART_IND_UINT 0xFFFFFFFF
#ifdef HIRES
#define FFTXTSIZE 36
#else
#define FFTXTSIZE 32
#endif
/* Number of elements handled by one work group: 3888 (HIRES) or 3072.     */
#define FFTXTPIX       (3*FFTXTSIZE*FFTXTSIZE)
/* Number of outer passes of the sort in main(); 2^12 = 4096 is the        */
/* smallest power of two not less than FFTXTPIX for both sizes.            */
#define STEPS     12  /* ceil log2 FFTXTPIX */
/* Elements loaded/stored per invocation (stride SIZEX between them).      */
#define PCS        4
/* Comparator slots examined per invocation in each sorting pass.          */
#define SPCS       3
/* Work group size: 972 invocations (HIRES) or 768.                        */
#define SIZEX  (FFTXTPIX/PCS)  /* FFTXTPIX/PCS */

layout(local_size_x=SIZEX) in;

/* Weights: main() reads w[0..FFTXTPIX-1] by local index and writes the    */
/* weights reordered by the sort at offset FFTXTPIX + global index.        */
layout(std430,binding=3) buffer FFWeightBuf { float w[]; } wbuf;
/* Output: prefix sums of the "new key" flags, one per element.            */
layout(std430,binding=4) buffer FFOnesBuf { uint k[]; } obuf;
/* Keys: read by main() and overwritten with the sorted keys.              */
layout(std430,binding=7) buffer FFPixBuf { uint pix[]; } pxbuf;

/* Work-group-local storage: keys (later flags / prefix sums) ...          */
shared uint  cache[FFTXTPIX];
/* ... and the float payload that is permuted together with the keys.      */
shared float fcache[FFTXTPIX];

/* Compare-and-exchange step of the sort: if the key stored at position i  */
/* is greater than the key at position j, exchange both the keys (cache)   */
/* and their float payloads (fcache); otherwise leave everything as is.    */
void CompSwap ( uint i, uint j )
{
  uint keyI = cache[i];
  uint keyJ = cache[j];

  if ( keyI <= keyJ )
    return;               /* already in order, nothing to exchange */

  cache[i] = keyJ;
  cache[j] = keyI;

  float payload = fcache[i];
  fcache[i] = fcache[j];
  fcache[j] = payload;
} /*CompSwap*/

/* Entry point.  One work group processes FFTXTPIX consecutive elements of */
/* pxbuf.pix, starting at FFTXTPIX*gl_WorkGroupID.x:                       */
/*   1. load keys into cache[] and weights into fcache[] (weight 0.0 for   */
/*      keys equal to RESTART_IND_UINT),                                   */
/*   2. sort keys (carrying the weights along) with a comparator network,  */
/*   3. write sorted keys to pxbuf.pix and sorted weights to               */
/*      wbuf.w[FFTXTPIX + global index],                                   */
/*   4. replace each key by a flag: 1 if it is not the sentinel and is     */
/*      greater than its predecessor (or is element 0), else 0,            */
/*   5. compute the inclusive prefix sums of the flags and store them in   */
/*      obuf.k.                                                            */
/* Each invocation handles PCS elements, SIZEX apart.                      */
void main ( void )
{
  uint x, y, iid[SPCS], id[SPCS], s, i, j, h, h2, h4, k, l,
       ii[PCS/2], m0, m1, flag[PCS];

  x = gl_LocalInvocationID.x;
  y = FFTXTPIX*gl_WorkGroupID.x + x;
        /* read the data into the cache; weights are indexed locally (i), */
        /* keys globally (j)                                              */
  for ( k = 0, i = x, j = y;  k < PCS;  k++, i += SIZEX, j += SIZEX )
    fcache[i] = (cache[i] = pxbuf.pix[j]) != RESTART_IND_UINT ?
                wbuf.w[i] : 0.0;
  barrier ();
        /* sorting: iid[k] are the comparator slots of this invocation */
  for ( iid[0] = x, k = 1;  k < SPCS;  k++ )
    iid[k] = iid[k-1]+SIZEX;
        /* h is the current block size, h2 = h/2 at the top of each pass; */
        /* the inner loop runs h2 down to 1 and the outer update restores */
        /* it from h                                                      */
  for ( s = 0, h = 2, h2 = 1;  s < STEPS;  s++, h2 = h, h += h ) {
          /* "flip" stage: compare element l of the lower half of a block */
          /* with the mirrored element of the upper half; pairs reaching  */
          /* past FFTXTPIX are skipped                                    */
    for ( k = 0; k < SPCS; k++ ) {
      l = iid[k] % h2;  id[k] = iid[k] / h2;  i = id[k]*h + l;
      if ( (j = (id[k]+1)*h-l-1) < FFTXTPIX )
        CompSwap ( i, j );
    }
    barrier ();
          /* remaining stages: compare elements h4 apart inside blocks of */
          /* size h2, halving the distance each time                      */
    for ( h4 = h2 / 2;  h2 > 1;  h2 = h4, h4 /= 2 ) {
      for ( k = 0; k < SPCS; k++ ) {
        l = iid[k] % h4;  id[k] = iid[k] / h4;  i = id[k]*h2 + l;
        if ( (j = i+h4) < FFTXTPIX )
          CompSwap ( i, j );
      }
      barrier ();
    }
  }
        /* copy the sorted data to the buffers and, while the sorted keys */
        /* are still in shared memory, work out the zero/one flags.  The  */
        /* neighbour key is taken from cache[i-1], not from pxbuf.pix:    */
        /* that element is written by another invocation, and barrier()   */
        /* alone does not make buffer writes visible across invocations.  */
  for ( k = 0, i = x, j = y;  k < PCS;  k++, i += SIZEX, j += SIZEX ) {
    pxbuf.pix[j] = cache[i];  wbuf.w[j+FFTXTPIX] = fcache[i];
    if ( cache[i] == RESTART_IND_UINT )
      flag[k] = 0;
    else if ( i == 0 )
      flag[k] = 1;
    else
      flag[k] = uint ( cache[i] > cache[i-1] );
  }
        /* every invocation must finish reading the keys before any of */
        /* them is overwritten with a flag                              */
  barrier ();
        /* now write the zeros and ones into the cache */
  for ( k = 0, i = x;  k < PCS;  k++, i += SIZEX )
    cache[i] = flag[k];
  barrier ();
        /* compute the prefix sums; each invocation serves PCS/2 pairs,   */
        /* numbered iid[k] (set above, needs SPCS >= PCS/2), ii[k] being  */
        /* twice that number                                              */
  ii[0] = x+x;
  for ( k = 1; k < PCS/2; k++ )
    ii[k] = ii[k-1] + (SIZEX+SIZEX);
        /* m0 is half the current block size, m1 = m0-1; i is the last    */
        /* element of the lower half of a block, j an element of its      */
        /* upper half, so reads and writes of one pass never overlap      */
  for ( m0 = 0x01, m1 = 0;  m0 < FFTXTPIX;  m1 = (m0 += m0)-1 ) {
    for ( k = 0; k < PCS/2; k++ ) {
      i = (ii[k] & ~m0) | m1;
      if ( (j = i + (iid[k] & m1) + 1) < FFTXTPIX )
        cache[j] += cache[i];
    }
    barrier ();
  }
        /* copy the prefix sums to the buffer */
  for ( k = 0, i = x, j = y;  k < PCS;  k++, i += SIZEX, j += SIZEX )
    obuf.k[j] = cache[i];
} /*main*/

