# -*- coding: utf-8 -*-

"""
This python module implements the Centered in Time, Centered in Space
(leapfrog) numerical scheme for the shallow water equations,
described in
L. P. Røed, "Documentation of simple ocean models for use in ensemble
predictions", Met no report 2012/3 and 2012/5 .

Copyright (C) 2016  SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.
"""

#Import packages we need
import os
import time
import numpy as np
import pyopencl as cl #OpenCL in Python


def _as_float32_c(array, label):
    """
    Returns `array` as a native single precision, C-contiguous numpy array.

    The data is uploaded byte-for-byte to the device, so anything that is
    not already native float32 in C order is converted first. This covers
    other dtypes, byte-swapped float32, Fortran ordered arrays and strided
    (non-contiguous) views. A message "Converting <label>" is printed
    whenever a conversion takes place. Arrays that already have the right
    layout are returned as-is (no copy).
    """
    array = np.asarray(array)
    if (array.dtype != np.float32 or not array.flags['C_CONTIGUOUS']):
        print("Converting " + label)
        array = np.ascontiguousarray(array, dtype=np.float32)
    return array


def _row_pitch(array):
    """
    Returns the size in bytes of one row of the 2D host array `array`,
    in the np.int32 format the kernels take as "pitch" argument.
    """
    return np.int32(array.shape[1] * array.itemsize)


class CTCS2LayerDataCL:
    """
    Class that holds data for the SW equations in OpenCL

    For every time dependent variable (eta, u, v) of each of the two
    layers, two device buffers are kept: `<name>_0` and `<name>_1`. The
    water depths h1_0 and h2_0 only have a single buffer. Each buffer has
    a matching `<name>_pitch` attribute (row size in bytes, np.int32).
    """

    def __init__(self, cl_ctx, h1_0, eta1_0, u1_0, v1_0, \
                 h2_0, eta2_0, u2_0, v2_0):
        """
        Uploads initial data to the CL device

        cl_ctx: OpenCL context the buffers are created in
        h1_0, eta1_0: (ny+2, nx+2) arrays for layer 1
        u1_0: (ny+2, nx+1) array for layer 1
        v1_0: (ny+1, nx+2) array for layer 1
        h2_0, eta2_0, u2_0, v2_0: Same shapes, for layer 2 (deepest)

        nx and ny are derived from the shape of h1_0 (minus ghost cells);
        the shapes of all other arrays are asserted against them.
        """
        #Make sure that the data is single precision floating point
        #and laid out in C order
        h1_0 = _as_float32_c(h1_0, "H_0")
        eta1_0 = _as_float32_c(eta1_0, "Eta_0")
        u1_0 = _as_float32_c(u1_0, "U_0")
        v1_0 = _as_float32_c(v1_0, "V_0")

        #Same for second (deepest) layer
        h2_0 = _as_float32_c(h2_0, "H2_0")
        eta2_0 = _as_float32_c(eta2_0, "Eta2_0")
        u2_0 = _as_float32_c(u2_0, "U2_0")
        v2_0 = _as_float32_c(v2_0, "V2_0")

        self.ny, self.nx = h1_0.shape
        self.nx = self.nx - 2 # Ghost cells
        self.ny = self.ny - 2

        cell_shape = (self.ny+2, self.nx+2)
        u_shape = (self.ny+2, self.nx+1)
        v_shape = (self.ny+1, self.nx+2)

        assert(h1_0.shape == cell_shape)
        assert(eta1_0.shape == cell_shape)
        assert(u1_0.shape == u_shape)
        assert(v1_0.shape == v_shape)

        #Same for layer 2
        assert(h2_0.shape == cell_shape)
        assert(eta2_0.shape == cell_shape)
        assert(u2_0.shape == u_shape)
        assert(v2_0.shape == v_shape)

        #Upload data to the device
        mf = cl.mem_flags
        def upload(host_array):
            return cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_array)

        #Both time levels (_0 and _1) start out as copies of the initial data
        self.h1_0 = upload(h1_0)

        self.eta1_0 = upload(eta1_0)
        self.eta1_1 = upload(eta1_0)

        self.u1_0 = upload(u1_0)
        self.u1_1 = upload(u1_0)

        self.v1_0 = upload(v1_0)
        self.v1_1 = upload(v1_0)

        #Same for layer 2
        self.h2_0 = upload(h2_0)

        self.eta2_0 = upload(eta2_0)
        self.eta2_1 = upload(eta2_0)

        self.u2_0 = upload(u2_0)
        self.u2_1 = upload(u2_0)

        self.v2_0 = upload(v2_0)
        self.v2_1 = upload(v2_0)

        #Compute pitches
        self.h1_0_pitch = _row_pitch(h1_0)

        self.eta1_0_pitch = _row_pitch(eta1_0)
        self.eta1_1_pitch = _row_pitch(eta1_0)

        self.u1_0_pitch = _row_pitch(u1_0)
        self.u1_1_pitch = _row_pitch(u1_0)

        self.v1_0_pitch = _row_pitch(v1_0)
        self.v1_1_pitch = _row_pitch(v1_0)

        #Same for layer 2
        self.h2_0_pitch = _row_pitch(h2_0)

        self.eta2_0_pitch = _row_pitch(eta2_0)
        self.eta2_1_pitch = _row_pitch(eta2_0)

        self.u2_0_pitch = _row_pitch(u2_0)
        self.u2_1_pitch = _row_pitch(u2_0)

        self.v2_0_pitch = _row_pitch(v2_0)
        self.v2_1_pitch = _row_pitch(v2_0)

    def swap(self):
        """
        Swaps the variables after a timestep has been completed

        Only the buffer references are exchanged (_0 <-> _1); no device
        data is copied. The pitch attributes are left alone, as both
        buffers of a pair have the same pitch.
        """
        self.eta1_1, self.eta1_0 = self.eta1_0, self.eta1_1
        self.u1_1, self.u1_0 = self.u1_0, self.u1_1
        self.v1_1, self.v1_0 = self.v1_0, self.v1_1

        #Same for layer 2
        self.eta2_1, self.eta2_0 = self.eta2_0, self.eta2_1
        self.u2_1, self.u2_0 = self.u2_0, self.u2_1
        self.v2_1, self.v2_0 = self.v2_0, self.v2_1

    def download(self, cl_queue):
        """
        Enables downloading data from CL device to Python

        cl_queue: OpenCL command queue used for the copies

        Returns the tuple (eta1, u1, v1, eta2, u2, v2) of float32 numpy
        arrays, read from the `_1` buffers.
        """
        cell_shape = (self.ny+2, self.nx+2)
        u_shape = (self.ny+2, self.nx+1)
        v_shape = (self.ny+1, self.nx+2)

        def fetch(device_buffer, shape):
            #Allocate data on the host for result, then copy device to host
            host_array = np.empty(shape, dtype=np.float32, order='C')
            cl.enqueue_copy(cl_queue, host_array, device_buffer)
            return host_array

        eta1_1 = fetch(self.eta1_1, cell_shape)
        u1_1 = fetch(self.u1_1, u_shape)
        v1_1 = fetch(self.v1_1, v_shape)

        #Same for layer 2
        eta2_1 = fetch(self.eta2_1, cell_shape)
        u2_1 = fetch(self.u2_1, u_shape)
        v2_1 = fetch(self.v2_1, v_shape)

        #Return
        return eta1_1, u1_1, v1_1, eta2_1, u2_1, v2_1


class CTCS2Layer:
    """
    Class that solves the SW equations using the Centered in time centered in space scheme
    """

    def __init__(self, \
                 h1_0, eta1_0, u1_0, v1_0, \
                 h2_0, eta2_0, u2_0, v2_0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, f, r1, r2, A, \
                 rho1, rho2,
                 wind_type=99, # "no wind" \
                 wind_tau0=0, wind_alpha=0, wind_xm=0, wind_Rc=0, \
                 wind_x0=0, wind_y0=0, \
                 wind_u0=0, wind_v0=0):
        """
        Initialization routine
        h1_0: Water depth incl ghost cells, (nx+2)*(ny+2) cells
        eta1_0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells
        u1_0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells
        v1_0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells
        h2_0: Water depth (layer 2) incl ghost cells, (nx+2)*(ny+2) cells
        eta2_0: Initial deviation from mean sea level (layer 2) incl ghost cells, (nx+2)*(ny+2) cells
        u2_0: Initial momentum (layer 2) along x-axis incl ghost cells, (nx+1)*(ny+2) cells
        v2_0: Initial momentum (layer 2) along y-axis incl ghost cells, (nx+2)*(ny+1) cells
        nx: Number of cells along x-axis
        ny: Number of cells along y-axis
        dx: Grid cell spacing along x-axis (20 000 m)
        dy: Grid cell spacing along y-axis (20 000 m)
        dt: Size of each timestep (90 s)
        g: Gravitational acceleration (9.81 m/s^2)
        f: Coriolis parameter (1.2e-4 s^-1)
        r1: Bottom friction coefficient (2.4e-3 m/s)
            NOTE(review): earlier documentation listed this as "r";
            presumably it describes r1 - confirm against the kernels
        r2: Inter-layer friction coefficient (m/s)
        A: Eddy viscosity coefficient (O(dx))
        rho1: Density of upper layer (kg / m^3), must satisfy rho1 <= rho2
        rho2: Density of lower layer (kg / m^3)
            NOTE(review): earlier documentation quoted 1025.0 for rho1 and
            1000.0 for rho2, which would fail the rho1 <= rho2 assertion
            below - confirm the intended example values
        wind_type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
            (the default, 99, is used as "no wind")
        wind_tau0: Amplitude of wind stress (Pa)
        wind_alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
        wind_xm: Maximum wind stress for bell shaped wind stress
        wind_Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
        wind_x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
        wind_y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
        wind_u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
        wind_v0: Translation speed along y for moving cyclone (-0.5*u0)
        """
        #Make sure we get compiler output from OpenCL
        os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"

        #Set which CL device to use
        #(this overwrites any PYOPENCL_CTX already set in the environment)
        os.environ["PYOPENCL_CTX"] = "1"

        #Create OpenCL context
        self.cl_ctx = cl.create_some_context()
        #The two spaces after "Using" are intentional: they reproduce the
        #output of the original statement: print "Using ", name
        print("Using  %s" % (self.cl_ctx.devices[0].name,))

        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)

        #Get kernels
        self.u_kernel = self.get_kernel("CTCS2Layer_U_kernel.opencl")
        self.v_kernel = self.get_kernel("CTCS2Layer_V_kernel.opencl")
        self.eta_kernel = self.get_kernel("CTCS2Layer_eta_kernel.opencl")

        #Create data by uploading to device
        self.cl_data = CTCS2LayerDataCL(self.cl_ctx, \
                                        h1_0, eta1_0, u1_0, v1_0, \
                                        h2_0, eta2_0, u2_0, v2_0)

        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
        #OpenCL kernel
        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.dt = np.float32(dt)
        self.g = np.float32(g)
        self.f = np.float32(f)
        self.r1 = np.float32(r1)
        self.r2 = np.float32(r2)
        self.A = np.float32(A)

        assert(rho1 <= rho2)
        self.rho1 = np.float32(rho1)
        self.rho2 = np.float32(rho2)

        self.wind_type = np.int32(wind_type)
        self.wind_tau0 = np.float32(wind_tau0)
        self.wind_alpha = np.float32(wind_alpha)
        self.wind_xm = np.float32(wind_xm)
        self.wind_Rc = np.float32(wind_Rc)
        self.wind_x0 = np.float32(wind_x0)
        self.wind_y0 = np.float32(wind_y0)
        self.wind_u0 = np.float32(wind_u0)
        self.wind_v0 = np.float32(wind_v0)

        #Initialize time
        self.t = np.float32(0.0)

        #Compute kernel launch parameters
        self.local_size = (8, 8) # WARNING::: MUST MATCH defines of block_width/height in kernels!
        #Round nx and ny up to the nearest multiple of the work group size
        self.global_size = ( \
                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
                      )

    def step(self, t_end=0.0):
        """
        Function which advances the simulation by t_end

        t_end: Amount of simulation time to advance in this call. It is
               measured from the start of the call, not from t=0. Steps of
               size dt are taken, with a shorter final step if t_end is
               not a multiple of dt.

        Returns the accumulated simulation time self.t after stepping.
        """
        n = int(t_end / self.dt + 1)
        data = self.cl_data

        for i in range(0, n):
            #Notation (u is used as example, same holds for eta and v):
            # data.u*_0 => U^{n-1} before U kernel, U^{n+1} after U kernel
            # data.u*_1 => U^{n}
            # When we call data.swap(), we swap these, so that
            # data.u*_0 => U^{n}
            # data.u*_1 => U^{n+1} (U kernel has been executed)
            # Now we are ready for the next time step

            #The last step is shortened so that we end exactly at t_end
            local_dt = np.float32(min(self.dt, t_end-i*self.dt))

            if (local_dt <= 0.0):
                break

            self.eta_kernel.computeEtaKernel(
                    self.cl_queue, self.global_size, self.local_size,
                    self.nx, self.ny,
                    self.dx, self.dy, local_dt,

                    data.eta1_0, data.eta1_0_pitch,     # eta^{n-1} => eta^{n+1}
                    data.u1_1, data.u1_1_pitch,         # U^{n}
                    data.v1_1, data.v1_1_pitch,         # V^{n}

                    #Same for layer 2
                    data.eta2_0, data.eta2_0_pitch,
                    data.u2_1, data.u2_1_pitch,
                    data.v2_1, data.v2_1_pitch)

            self.u_kernel.computeUKernel(
                    self.cl_queue, self.global_size, self.local_size,
                    self.nx, self.ny,
                    self.dx, self.dy, local_dt,
                    self.g, self.f,
                    self.r1, self.r2,
                    self.A,
                    self.rho1, self.rho2,

                    data.h1_0, data.h1_0_pitch,
                    data.eta1_1, data.eta1_1_pitch,     # eta^{n}
                    data.u1_0, data.u1_0_pitch,         # U^{n-1} => U^{n+1}
                    data.u1_1, data.u1_1_pitch,         # U^{n}
                    data.v1_1, data.v1_1_pitch,         # V^{n}

                    #Same for layer 2
                    data.h2_0, data.h2_0_pitch,
                    data.eta2_1, data.eta2_1_pitch,
                    data.u2_0, data.u2_0_pitch,
                    data.u2_1, data.u2_1_pitch,
                    data.v2_1, data.v2_1_pitch,

                    self.wind_type,
                    self.wind_tau0, self.wind_alpha, self.wind_xm, self.wind_Rc,
                    self.wind_x0, self.wind_y0,
                    self.wind_u0, self.wind_v0,
                    self.t)

            self.v_kernel.computeVKernel(
                    self.cl_queue, self.global_size, self.local_size,
                    self.nx, self.ny,
                    self.dx, self.dy, local_dt,
                    self.g, self.f,
                    self.r1, self.r2,
                    self.A,
                    self.rho1, self.rho2,

                    data.h1_0, data.h1_0_pitch,
                    data.eta1_1, data.eta1_1_pitch,     # eta^{n}
                    data.u1_1, data.u1_1_pitch,         # U^{n}
                    data.v1_0, data.v1_0_pitch,         # V^{n-1} => V^{n+1}
                    data.v1_1, data.v1_1_pitch,         # V^{n}

                    #Same for layer 2
                    data.h2_0, data.h2_0_pitch,
                    data.eta2_1, data.eta2_1_pitch,
                    data.u2_1, data.u2_1_pitch,
                    data.v2_0, data.v2_0_pitch,
                    data.v2_1, data.v2_1_pitch,

                    self.wind_type,
                    self.wind_tau0, self.wind_alpha, self.wind_xm, self.wind_Rc,
                    self.wind_x0, self.wind_y0,
                    self.wind_u0, self.wind_v0,
                    self.t)

            #After the kernels, swap the data pointers
            data.swap()

            self.t += local_dt

        return self.t

    def get_kernel(self, kernel_filename):
        """
        Reads a text file and creates an OpenCL program from that

        kernel_filename: Name of the kernel source file, looked up in the
                         directory this module is located in

        Returns the program built for self.cl_ctx.
        """
        #Read the proper program
        module_path = os.path.dirname(os.path.realpath(__file__))
        fullpath = os.path.join(module_path, kernel_filename)
        with open(fullpath, "r") as kernel_file:
            kernel_string = kernel_file.read()
            kernel = cl.Program(self.cl_ctx, kernel_string).build()

        return kernel

    def download(self):
        """
        Downloads the current state from the CL device

        Returns the tuple (eta1, u1, v1, eta2, u2, v2) of numpy arrays,
        see CTCS2LayerDataCL.download.
        """
        return self.cl_data.download(self.cl_queue)