# -*- coding: utf-8 -*-

"""
This python module implements the 2nd order HLL flux

Copyright (C) 2016  SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np

from pycuda import gpuarray


"""
Class that solves the SW equations using the Forward-Backward linear scheme
"""
class EE2D_KP07_dimsplit (BaseSimulator):

    """
    Initialization routine
    rho: Density
    rho_u: Momentum along x-axis
    rho_v: Momentum along y-axis
    E: energy
    nx: Number of cells along x-axis
    ny: Number of cells along y-axis
    dx: Grid cell spacing along x-axis
    dy: Grid cell spacing along y-axis
    dt: Size of each timestep 
    g: Gravitational constant
    gamma: Gas constant
    p: pressure
    """
    def __init__(self, 
                 context, 
                 rho, rho_u, rho_v, E, 
                 nx, ny, 
                 dx, dy,  
                 g, 
                 gamma, 
                 theta=1.3, 
                 cfl_scale=0.9,
                 boundary_conditions=BoundaryCondition(), 
                 block_width=16, block_height=8):
                 
        # Call super constructor
        super().__init__(context, 
            nx, ny, 
            dx, dy, 
            boundary_conditions,
            cfl_scale, 
            2, 
            block_width, block_height)
        self.g = np.float32(g)
        self.gamma = np.float32(gamma)
        self.theta = np.float32(theta) 

        #Get kernels
        module = context.get_module("cuda/EE2D_KP07_dimsplit.cu", 
                                        defines={
                                            'BLOCK_WIDTH': self.block_size[0], 
                                            'BLOCK_HEIGHT': self.block_size[1]
                                        }, 
                                        compile_args={
                                            'no_extern_c': True,
                                            'options': ["--use_fast_math"], 
                                        }, 
                                        jit_compile_args={})
        self.kernel = module.get_function("KP07DimsplitKernel")
        self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
        
        
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, 
                        nx, ny, 
                        2, 2, 
                        [rho, rho_u, rho_v, E])
        self.u1 = Common.ArakawaA2D(self.stream, 
                        nx, ny, 
                        2, 2, 
                        [None, None, None, None])
        self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
        dt_x = np.min(self.dx / (np.abs(rho_u/rho) + np.sqrt(gamma*rho)))
        dt_y = np.min(self.dy / (np.abs(rho_v/rho) + np.sqrt(gamma*rho)))
        self.dt = min(dt_x, dt_y)
        self.cfl_data.fill(self.dt, stream=self.stream)
                        
    
    def substep(self, dt, step_number, external=True, internal=True):
            self.substepDimsplit(0.5*dt, step_number, external, internal)
    
    def substepDimsplit(self, dt, substep, external, internal):
        if external and internal:
            #print("COMPLETE DOMAIN (dt=" + str(dt) + ")")

            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, 
                self.nx, self.ny, 
                self.dx, self.dy, dt, 
                self.g, 
                self.gamma, 
                self.theta, 
                substep,
                self.boundary_conditions, 
                self.u0[0].data.gpudata, self.u0[0].data.strides[0], 
                self.u0[1].data.gpudata, self.u0[1].data.strides[0], 
                self.u0[2].data.gpudata, self.u0[2].data.strides[0], 
                self.u0[3].data.gpudata, self.u0[3].data.strides[0], 
                self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                self.u1[2].data.gpudata, self.u1[2].data.strides[0], 
                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                self.cfl_data.gpudata,
                0, 0, 
                self.nx, self.ny)
            return
        
        if external and not internal:
            #############################################################
            # XXX: Only treating north and south external cells for now #
            #############################################################

            ns_grid_size = (self.grid_size[0], 1)

            # NORTH
            # (x0, y0) x (x1, y1)
            #  (0, ny-y_halo) x (nx, ny)
            self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream, 
                self.nx, self.ny,
                self.dx, self.dy, dt, 
                self.g, 
                self.gamma, 
                self.theta, 
                substep,
                self.boundary_conditions, 
                self.u0[0].data.gpudata, self.u0[0].data.strides[0], 
                self.u0[1].data.gpudata, self.u0[1].data.strides[0], 
                self.u0[2].data.gpudata, self.u0[2].data.strides[0], 
                self.u0[3].data.gpudata, self.u0[3].data.strides[0], 
                self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                self.u1[2].data.gpudata, self.u1[2].data.strides[0], 
                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                self.cfl_data.gpudata,
                0, self.ny - int(self.u0[0].y_halo),
                self.nx, self.ny)

            # SOUTH
            # (x0, y0) x (x1, y1)
            #   (0, 0) x (nx, y_halo)
            self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream, 
                self.nx, self.ny,
                self.dx, self.dy, dt, 
                self.g, 
                self.gamma, 
                self.theta, 
                substep,
                self.boundary_conditions, 
                self.u0[0].data.gpudata, self.u0[0].data.strides[0], 
                self.u0[1].data.gpudata, self.u0[1].data.strides[0], 
                self.u0[2].data.gpudata, self.u0[2].data.strides[0], 
                self.u0[3].data.gpudata, self.u0[3].data.strides[0], 
                self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                self.u1[2].data.gpudata, self.u1[2].data.strides[0], 
                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                self.cfl_data.gpudata,
                0, 0,
                self.nx, int(self.u0[0].y_halo))
            return

        if internal and not external:
            #############################################################
            # XXX: Only treating north and south external cells for now #
            #      So we need to include west and east boundary here!   #
            #############################################################

            # INTERNAL DOMAIN
            #         (x0, y0) x (x1, y1)
            # (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream, 
                self.nx, self.ny, 
                self.dx, self.dy, dt, 
                self.g, 
                self.gamma, 
                self.theta, 
                substep,
                self.boundary_conditions, 
                self.u0[0].data.gpudata, self.u0[0].data.strides[0], 
                self.u0[1].data.gpudata, self.u0[1].data.strides[0], 
                self.u0[2].data.gpudata, self.u0[2].data.strides[0], 
                self.u0[3].data.gpudata, self.u0[3].data.strides[0], 
                self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                self.u1[2].data.gpudata, self.u1[2].data.strides[0], 
                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                self.cfl_data.gpudata,
                0, int(self.u0[0].y_halo),
                self.nx, self.ny - int(self.u0[0].y_halo))
            return

    def swapBuffers(self):
        self.u0, self.u1 = self.u1, self.u0
        return
        
    def getOutput(self):
        return self.u0

    def check(self):
        self.u0.check()
        self.u1.check()
        return
        
    def computeDt(self):
        max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
        return max_dt*0.5