/**
This OpenCL kernel implements part of the Centered in Time, Centered 
in Space (leapfrog) numerical scheme for the shallow water equations, 
described in 
L. P. Røed, "Documentation of simple ocean models for use in ensemble
predictions", Met no report 2012/3 and 2012/5 .

Copyright (C) 2016  SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/


// Dimensions (in work-items) of the tile processed by one work-group.
// The local-memory tiles below are sized from these constants, and
// computeUKernel indexes them with get_local_id() plus a halo offset of up
// to 2, so the local work size used at launch must not exceed
// block_width x block_height.
#define block_height 8
#define block_width 8


// Local-memory tile type used for H and eta:
// block_height+2 rows by block_width+1 columns.
typedef __local float eta_shmem[block_height+2][block_width+1];
// Local-memory tile type used for U:
// block_height+2 rows by block_width+2 columns.
typedef __local float u_shmem[block_height+2][block_width+2];
// Local-memory tile type used for V:
// block_height+1 rows by block_width+1 columns.
typedef __local float v_shmem[block_height+1][block_width+1];


/**
  * Evaluates the x-component of the wind stress (divided by density) for the
  * calling work-item. The position is derived from get_global_id() and the
  * grid spacing dx_, dy_.
  *
  * wind_stress_type_ selects the forcing:
  *   0 (UNIFORM_ALONGSHORE):     tau0_/rho_ * exp(-alpha_*y)
  *   1 (BELL_SHAPED_ALONGSHORE): as type 0, additionally multiplied by
  *                               exp(-(alpha_*(x - xm_))^2); only applied
  *                               while t_ <= 48.0f*3600.0f, otherwise 0
  *   2 (MOVING_CYCLONE):         cyclone of radius scale Rc_ whose centre
  *                               starts at (x0_, y0_) and moves with
  *                               velocity (u0_, v0_), evaluated at t_ + dt_
  * Any other value yields 0.
  *
  * Returns the wind stress term that is added to the U momentum equation.
  */
float windStressX(int wind_stress_type_,
                float dx_, float dy_, float dt_,
                float tau0_, float rho_, float alpha_, float xm_, float Rc_,
                float x0_, float y0_,
                float u0_, float v0_,
                float t_) {

    //UNIFORM_ALONGSHORE
    if (wind_stress_type_ == 0) {
        const float pos_y = (get_global_id(1)+0.5f)*dy_;
        return tau0_/rho_ * exp(-alpha_*pos_y);
    }

    //BELL_SHAPED_ALONGSHORE
    if (wind_stress_type_ == 1) {
        //The forcing is switched off once t_ has passed the cut-off
        //(written as a negated <= so that a NaN t_ also yields zero)
        if (!(t_ <= 48.0f*3600.0f)) {
            return 0.0f;
        }

        const float scaled_x = alpha_*((get_global_id(0)+0.5f)*dx_-xm_);
        const float scaled_x_sq = scaled_x*scaled_x;
        const float pos_y = (get_global_id(1)+0.5f)*dy_;
        return tau0_/rho_ * exp(-scaled_x_sq) * exp(-alpha_*pos_y);
    }

    //MOVING_CYCLONE
    if (wind_stress_type_ == 2) {
        const float pos_x = (get_global_id(0))*dx_;
        const float pos_y = (get_global_id(1)+0.5f)*dy_;

        //Offset of this position from the cyclone centre at time t_+dt_
        const float off_x = (pos_x-x0_-u0_*(t_+dt_));
        const float off_x_sq = off_x*off_x;
        const float off_y = (pos_y-y0_-v0_*(t_+dt_));
        const float off_y_sq = off_y*off_y;

        const float radius = sqrt(off_x_sq+off_y_sq);
        const float rel = 1.0f - radius/Rc_;
        const float rel_sq = rel*rel;

        return -(tau0_/rho_) * (off_y/Rc_) * exp(-0.5f*rel_sq);
    }

    //Unknown wind stress type: no forcing
    return 0.0f;
}


/**
  * Kernel that evolves U one step in time.
  *
  * Each work-item updates the U value of both layers at one position
  * (ti, tj). The work-group first stages tiles of H, eta, U^n and V^n
  * (including a halo) in local memory, synchronises, and then every
  * work-item evaluates the pressure, Coriolis, nonlinear, friction,
  * eddy viscosity and wind stress terms from the staged values.
  *
  * Buffers are addressed as pitched 2D arrays: every *_pitch_ argument is
  * the row stride in bytes (rows are located through a char* cast).
  *
  * U1_0 / U2_0 hold U^n-1 on entry and are overwritten with U^n+1 for
  * positions with 0 < ti < nx_ and 0 < tj < ny_+1; all other buffers are
  * only read.
  *
  * The local work size must not exceed block_width x block_height, since
  * the local-memory tiles are sized from those constants.
  */
__kernel void computeUKernel(
        //Discretization parameters
        int nx_, int ny_, //< Domain size used for clamping reads and guarding writes
        float dx_, float dy_, float dt_, //< Grid spacing and time step
    
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r1_, //< Inter-layer friction coefficient
        float r2_, //< Bottom friction coefficient
    
        //Numerical diffusion
        float A_, //< Coefficient multiplying the eddy viscosity terms (also enters D)
        
        //Density of each layer
        float rho1_,
        float rho2_,
    
        //Data for layer 1
        __global float* H1_ptr_, int H1_pitch_,
        __global float* eta1_1_ptr_, int eta1_1_pitch_, // eta^n
        __global float* U1_0_ptr_, int U1_0_pitch_, // U^n-1, also output, U^n+1
        __global float* U1_1_ptr_, int U1_1_pitch_, // U^n
        __global float* V1_1_ptr_, int V1_1_pitch_, // V^n
        
        //Data for layer 2
        __global float* H2_ptr_, int H2_pitch_,
        __global float* eta2_1_ptr_, int eta2_1_pitch_, // eta^n
        __global float* U2_0_ptr_, int U2_0_pitch_, // U^n-1, also output, U^n+1
        __global float* U2_1_ptr_, int U2_1_pitch_, // U^n
        __global float* V2_1_ptr_, int V2_1_pitch_, // V^n
    
        // Wind stress parameters (forwarded unchanged to windStressX)
        int wind_stress_type_, 
        float tau0_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
                    
    //Local-memory tiles for layer 1
    eta_shmem H1_shared;
    eta_shmem eta1_shared;
    u_shmem U1_shared;
    v_shmem V1_shared;
    
    //Local-memory tiles for layer 2
    eta_shmem H2_shared;
    eta_shmem eta2_shared;
    u_shmem U2_shared;
    v_shmem V2_shared;
   
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Start of block within domain
    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells

    //Index of cell within domain
    const int ti = bx + tx;
    const int tj = by + ty;
    
    //Compute pointer to current row in the U array
    //(pitch is in bytes, hence the detour through a char pointer)
    __global float* const U1_0_row = (__global float*) ((__global char*) U1_0_ptr_ + U1_0_pitch_*tj);
    __global float* const U2_0_row = (__global float*) ((__global char*) U2_0_ptr_ + U2_0_pitch_*tj);

    //Read U^n-1 for this position; work-items outside the updated range
    //keep 0 and never write their result back (same guard as the final write)
    float U1_0 = 0.0f;
    float U2_0 = 0.0f;
    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
        U1_0 = U1_0_row[ti];
        U2_0 = U2_0_row[ti];
    }

    //Read H and eta into shared memory: (block_width+1)*(block_height+2) values.
    //Tile entry [j][i] holds row by+j-1 and column bx+i, so [ty+1][tx] is (ti, tj).
    //The loops stride by the local size so the halo is covered as well.
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        // "fake" global ghost cells by clamping
        const int l = clamp(by + j - 1, 1, ny_);
        
        //Compute the pointer to current row in the H and eta arrays
        __global float* const H1_row = (__global float*) ((__global char*) H1_ptr_ + H1_pitch_*l);
        __global float* const H2_row = (__global float*) ((__global char*) H2_ptr_ + H2_pitch_*l);
        
        __global float* const eta1_1_row = (__global float*) ((__global char*) eta1_1_ptr_ + eta1_1_pitch_*l);
        __global float* const eta2_1_row = (__global float*) ((__global char*) eta2_1_ptr_ + eta2_1_pitch_*l);
        
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            // "fake" global ghost cells by clamping
            const int k = clamp(bx + i, 1, nx_);
            
            H1_shared[j][i] = H1_row[k];
            H2_shared[j][i] = H2_row[k];
            
            eta1_shared[j][i] = eta1_1_row[k];
            eta2_shared[j][i] = eta2_1_row[k];
        }
    }

    //Read U^n into shared memory: (block_width+2)*(block_height+2) values.
    //Tile entry [j][i] holds row by+j-1 and column bx+i-1, so [ty+1][tx+1] is (ti, tj).
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        // "fake" ghost cells by clamping
        const int l = clamp(by + j - 1, 1, ny_);
        
        //Compute the pointer to current row in the U array
        __global float* const U1_1_row = (__global float*) ((__global char*) U1_1_ptr_ + U1_1_pitch_*l);
        __global float* const U2_1_row = (__global float*) ((__global char*) U2_1_ptr_ + U2_1_pitch_*l);
        
        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
            // Prevent out-of-bounds (columns 0..nx_ are read)
            const int k = clamp(bx + i - 1, 0, nx_);
            
            U1_shared[j][i] = U1_1_row[k];
            U2_shared[j][i] = U2_1_row[k];
        }
    }
    

    //Read V^n into shared memory: (block_width+1)*(block_height+1) values.
    //Tile entry [j][i] holds row by+j-1 and column bx+i, so [ty+1][tx] is (ti, tj).
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        // Prevent out-of-bounds (rows 0..ny_ are read)
        const int l = clamp(by + j - 1, 0, ny_);
        
        //Compute the pointer to current row in the V array
        __global float* const V1_1_row = (__global float*) ((__global char*) V1_1_ptr_ + V1_1_pitch_*l);
        __global float* const V2_1_row = (__global float*) ((__global char*) V2_1_ptr_ + V2_1_pitch_*l);
        
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            // "fake" ghost cells by clamping
            const int k = clamp(bx + i, 1, nx_);
            
            V1_shared[j][i] = V1_1_row[k];
            V2_shared[j][i] = V2_1_row[k];
        }
    }
    
    //Make sure all threads have read into shared mem
    barrier(CLK_LOCAL_MEM_FENCE);
    
    
    /**
      * Now get all our required variables as short-hands
      * here we use the notation of
      *  Var1_00 as var_i,j for layer 1
      *  Var2_p0 as var_i+1,j for layer 2
      *  Var1_0m as var_i,j-1 for layer 1
      * etc
      */
    //Layer 1
    const float U1_00 = U1_shared[ty+1][tx+1]; //U at "center"
    const float U1_0p = U1_shared[ty+2][tx+1]; //U at "north"
    const float U1_0m = U1_shared[ty  ][tx+1]; //U at "south"
    const float U1_p0 = U1_shared[ty+1][tx+2]; //U at "east"
    const float U1_m0 = U1_shared[ty+1][tx  ]; //U at "west"
    
    //V at the four positions (i and i+1, j and j-1) surrounding this U
    const float V1_00 = V1_shared[ty+1][tx  ];
    const float V1_p0 = V1_shared[ty+1][tx+1];
    const float V1_0m = V1_shared[ty  ][tx  ];
    const float V1_pm = V1_shared[ty  ][tx+1];
    
    //H at columns i and i+1, rows j-1, j and j+1
    const float H1_0m = H1_shared[ty  ][tx  ]; 
    const float H1_00 = H1_shared[ty+1][tx  ]; 
    const float H1_0p = H1_shared[ty+2][tx  ];
    const float H1_pm = H1_shared[ty  ][tx+1];
    const float H1_p0 = H1_shared[ty+1][tx+1]; 
    const float H1_pp = H1_shared[ty+2][tx+1];
    
    //eta at columns i and i+1, rows j-1, j and j+1
    const float eta1_0m = eta1_shared[ty  ][tx  ]; 
    const float eta1_00 = eta1_shared[ty+1][tx  ]; 
    const float eta1_0p = eta1_shared[ty+2][tx  ];
    const float eta1_pm = eta1_shared[ty  ][tx+1];
    const float eta1_p0 = eta1_shared[ty+1][tx+1]; 
    const float eta1_pp = eta1_shared[ty+2][tx+1];
    
    
    //Layer 2 (bottom), same stencil layout as layer 1
    const float U2_00 = U2_shared[ty+1][tx+1]; 
    const float U2_0p = U2_shared[ty+2][tx+1]; 
    const float U2_0m = U2_shared[ty  ][tx+1]; 
    const float U2_p0 = U2_shared[ty+1][tx+2]; 
    const float U2_m0 = U2_shared[ty+1][tx  ]; 
    
    const float V2_00 = V2_shared[ty+1][tx  ];
    const float V2_p0 = V2_shared[ty+1][tx+1];
    const float V2_0m = V2_shared[ty  ][tx  ];
    const float V2_pm = V2_shared[ty  ][tx+1];

    const float H2_0m = H2_shared[ty  ][tx  ]; 
    const float H2_00 = H2_shared[ty+1][tx  ]; 
    const float H2_0p = H2_shared[ty+2][tx  ];
    const float H2_pm = H2_shared[ty  ][tx+1];
    const float H2_p0 = H2_shared[ty+1][tx+1]; 
    const float H2_pp = H2_shared[ty+2][tx+1];
    
    const float eta2_0m = eta2_shared[ty  ][tx  ]; 
    const float eta2_00 = eta2_shared[ty+1][tx  ]; 
    const float eta2_0p = eta2_shared[ty+2][tx  ];
    const float eta2_pm = eta2_shared[ty  ][tx+1];
    const float eta2_p0 = eta2_shared[ty+1][tx+1]; 
    const float eta2_pp = eta2_shared[ty+2][tx+1];

    
    //Reconstruct Eta_bar as the mean of the four eta values
    //below (_0m: rows j-1, j) and above (_00: rows j, j+1) this U
    const float eta1_bar_0m = 0.25f*(eta1_0m + eta1_pm + eta1_00 + eta1_p0);
    const float eta1_bar_00 = 0.25f*(eta1_00 + eta1_p0 + eta1_0p + eta1_pp);
    
    const float eta2_bar_0m = 0.25f*(eta2_0m + eta2_pm + eta2_00 + eta2_p0);
    const float eta2_bar_00 = 0.25f*(eta2_00 + eta2_p0 + eta2_0p + eta2_pp);

    
    //Reconstruct H_bar (same four-point means as Eta_bar)
    //and H_x (mean of columns i and i+1, i.e. at the U position)
    const float H1_bar_0m = 0.25f*(H1_0m + H1_pm + H1_00 + H1_p0);
    const float H1_bar_00 = 0.25f*(H1_00 + H1_p0 + H1_0p + H1_pp);
    const float H1_x = 0.5f*(H1_00 + H1_p0);
    
    const float H2_bar_0m = 0.25f*(H2_0m + H2_pm + H2_00 + H2_p0);
    const float H2_bar_00 = 0.25f*(H2_00 + H2_p0 + H2_0p + H2_pp);
    const float H2_x = 0.5f*(H2_00 + H2_p0);
    
    
    //Compute layer thickness of both layers:
    //top layer    h1 = H1 + eta1 - eta2
    //bottom layer h2 = H2 + eta2
    const float h1_p0 = H1_p0 + eta1_p0 - eta2_p0;
    const float h1_00 = H1_00 + eta1_00 - eta2_00;
    const float h1_bar_0m = H1_bar_0m + eta1_bar_0m - eta2_bar_0m;
    const float h1_bar_00 = H1_bar_00 + eta1_bar_00 - eta2_bar_00;
    
    const float h2_p0 = H2_p0 + eta2_p0;
    const float h2_00 = H2_00 + eta2_00;
    const float h2_bar_0m = H2_bar_0m + eta2_bar_0m;
    const float h2_bar_00 = H2_bar_00 + eta2_bar_00;
    
    
    //Compute pressure components
    //(layer thickness averaged to the U position)
    const float h1_x = 0.5f*(h1_p0 + h1_00);
    const float h2_x = 0.5f*(h2_p0 + h2_00);
    
    //Alternative pressure formulation, currently not in use:
    //const float epsilon = (rho2_ - rho1_)/rho2_;
    //const float P1_x = -g_*h1_x * (eta1_p0 - eta1_00 + h2_p0 - h2_00) * (1.0f - epsilon);
    //const float P2_x = -g_*h2_x * (eta2_p0 - eta2_00 + H2_p0 - H2_00);
    
    //Active formulation. Both terms are differences between columns i+1 and i;
    //the division by dx_ happens in the final update below.
    const float P1_x = - g_*h1_x*(eta1_p0 - eta1_00) - 0.5f*g_*(eta1_p0*eta1_p0 - eta1_00*eta1_00);
    const float P2_x = - g_ * (rho1_/rho2_) * 
                            ( //Pressure contribution from top layer
                            h2_x*(eta1_p0 - eta1_00) + 0.5f*(eta1_p0*eta1_p0 - eta1_00*eta1_00) 
                            )
                       - g_ * ((rho2_ - rho1_)/rho2_) * 
                            ( //Pressure contribution from bottom layer
                            h2_x*(eta2_p0 - eta2_00) + 0.5f*(eta2_p0*eta2_p0 - eta2_00*eta2_00) 
                            );
    

    //Reconstruct V at the U position
    //(mean of the four surrounding V values, used by the Coriolis term)
    const float V1_bar = 0.25f*(V1_0m + V1_00 + V1_pm + V1_p0);
    const float V2_bar = 0.25f*(V2_0m + V2_00 + V2_pm + V2_p0);

    
    //Calculate the bottom and/or inter-layer friction coefficient
    //FIXME: Should this be h instead of H?
    const float C1 = r1_/H1_x;
    const float C2 = r2_/H2_x;

    
    //Calculate numerical diffusion / subgrid energy loss coefficient
    //(appears in the denominator of the final update)
    const float D = 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
    
    
    //Calculate nonlinear effects
    //_a/_b: U*U/h differences in x, _c/_d: U*V/h_bar differences in y;
    //the sums of two neighbours are un-normalised, hence the common 0.25f
    const float N1_a = (U1_p0 + U1_00)*(U1_p0 + U1_00) / (h1_p0);
    const float N1_b = (U1_00 + U1_m0)*(U1_00 + U1_m0) / (h1_00);
    const float N1_c = (U1_0p + U1_00)*(V1_p0 + V1_00) / (h1_bar_00);
    const float N1_d = (U1_00 + U1_0m)*(V1_pm + V1_0m) / (h1_bar_0m);
    const float N1 = 0.25f*( N1_a - N1_b + (dx_/dy_)*(N1_c - N1_d) );
    
    const float N2_a = (U2_p0 + U2_00)*(U2_p0 + U2_00) / (h2_p0);
    const float N2_b = (U2_00 + U2_m0)*(U2_00 + U2_m0) / (h2_00);
    const float N2_c = (U2_0p + U2_00)*(V2_p0 + V2_00) / (h2_bar_00);
    const float N2_d = (U2_00 + U2_0m)*(V2_pm + V2_0m) / (h2_bar_0m);
    const float N2 = 0.25f*( N2_a - N2_b + (dx_/dy_)*(N2_c - N2_d) );
    
    
    //Calculate eddy viscosity terms
    //Neighbours are taken from U^n, while the centre value is U^n-1 (U*_0)
    const float E1 = (U1_p0 - U1_0 + U1_m0)/(dx_*dx_) + (U1_0p - U1_0 + U1_0m)/(dy_*dy_);
    const float E2 = (U2_p0 - U2_0 + U2_m0)/(dx_*dx_) + (U2_0p - U2_0 + U2_0m)/(dy_*dy_);
    
    
    //Calculate the wind shear stress for the top layer
    //(rho1_ is passed as the density; only U1_2 receives X)
    const float X = windStressX(
        wind_stress_type_, 
        dx_, dy_, dt_,
        tau0_, rho1_, alpha_, xm_, Rc_,
        x0_, y0_,
        u0_, v0_,
        t_);
    
    
    //Compute U at the next timestep: U^n+1 = (U^n-1 + 2*dt*(sum of terms)) / (1 + ...)
    //NOTE(review): both layers use C1 times the other layer's U^n-1 as a source,
    //but only C2 appears in a denominator (layer 2) - confirm against the reference scheme
    float U1_2 = (U1_0 + 2.0f*dt_*(f_*V1_bar + (N1 + P1_x)/dx_ + X + C1*U2_0 + A_*E1) ) / (1.0f + D);
    float U2_2 = (U2_0 + 2.0f*dt_*(f_*V2_bar + (N2 + P2_x)/dx_     + C1*U1_0 + A_*E2) ) / (1.0f + 2.0f*dt_*C2 + D);

    
    //Write to main memory for internal cells
    //(overwrites the U^n-1 buffers with U^n+1)
    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
        U1_0_row[ti] = U1_2;
        U2_0_row[ti] = U2_2;
    }
}