debug: add shorter stop time to investigate NaNs

feat: add more save times to mpi testing
feat(array): improve checking the array for NaNs
2025-11-28 07:49:49 +01:00 · 2025-09-03 18:33:05 +02:00 · 2025-09-03 18:10:25 +02:00 · 2025-09-03 18:08:45 +02:00 · 2025-09-03 18:07:48 +02:00 · 2025-09-03 18:07:13 +02:00
9 changed files with 64 additions and 50 deletions
--- a/GPUSimulators/common/arrays/hip/arkawa2d.py
+++ b/GPUSimulators/common/arrays/hip/arkawa2d.py
@ -1,4 +1,5 @@
 import ctypes
+from typing import Union

 import numpy as np
 from hip import hip, hipblas
@ -13,12 +14,28 @@ class HIPArakawaA2D(BaseArakawaA2D):
    A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
    """

-    def __init__(self, stream, nx, ny, halo_x, halo_y, cpu_variables):
+    def __init__(self, stream: hip.ihipStream_t, nx: int, ny: int, halo_x: int, halo_y: int, cpu_variables: list[Union[np.ndarray, None]]):
        """
        Uploads initial data to the GPU device
        """
        super().__init__(stream, nx, ny, halo_x, halo_y, cpu_variables, HIPArray2D)

+        # Variables for ``__sum_array``
+        # TODO should have a way of not hardcoding the dtype
+        dtype = np.float32
+        self.__result_h = np.zeros(1, dtype=dtype)
+        self.__num_bytes = self.__result_h.itemsize
+        self.__result_d = hip_check(hip.hipMalloc(self.__num_bytes))
+        self.__total_sum_d = hip_check(hip.hipMalloc(self.__num_bytes))
+
+        self.__handle = hip_check(hipblas.hipblasCreate())
+
+    def __del__(self):
+        # Cleanup GPU variables in ``__sum_array``
+        hip_check(hipblas.hipblasDestroy(self.__handle))
+        hip_check(hip.hipFree(self.__result_d))
+        hip_check(hip.hipFree(self.__total_sum_d))
+
    def check(self):
        """
        Checks that data is still sane
@ -31,8 +48,7 @@ class HIPArakawaA2D(BaseArakawaA2D):
            if np.isnan(var_sum):
                raise ValueError("Data contains NaN values!")

-    @staticmethod
-    def __sum_array(array: HIPArray2D) -> np.ndarray[tuple[int]]:
+    def __sum_array(self, array: HIPArray2D) -> np.ndarray[tuple[int]]:
        """
        Sum all the elements in HIPArray2D using hipblas.
        Args:
@ -40,35 +56,22 @@ class HIPArakawaA2D(BaseArakawaA2D):
        Returns:
            The sum of all the elements in ``array``.
        """
-        dtype = array.dtype
-        result_h = np.zeros(1, dtype=dtype)
-        num_bytes = dtype.itemsize
-        result_d = hip_check(hip.hipMalloc(num_bytes))
-
-        # Sum the ``data_h`` array using hipblas
-        handle = hip_check(hipblas.hipblasCreate())

        # Using pitched memory, so we need to sum row by row
-        total_sum_d = hip_check(hip.hipMalloc(num_bytes))
-        hip_check(hip.hipMemset(total_sum_d, 0, num_bytes))
+        hip_check(hip.hipMemset(self.__total_sum_d, 0, self.__num_bytes))

        width, height = array.shape

        for y in range(height):
            row_ptr = int(array.data) + y * array.pitch_d

-            hip_check(hipblas.hipblasSasum(handle, width, row_ptr, 1, result_d))
+            hip_check(hipblas.hipblasSasum(self.__handle, width, row_ptr, 1, self.__result_d))

-            hip_check(hipblas.hipblasSaxpy(handle, 1, ctypes.c_float(1.0), result_d, 1, total_sum_d, 1))
-
-            hip_check(hip.hipMemcpy(result_h, total_sum_d, num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost))
+            hip_check(
+                hipblas.hipblasSaxpy(self.__handle, 1, ctypes.c_float(1.0), self.__result_d, 1, self.__total_sum_d, 1))

        # Copy over the result from the device
-        hip_check(hip.hipMemcpy(result_h, total_sum_d, num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost))
+        hip_check(hip.hipMemcpy(self.__result_h, self.__total_sum_d, self.__num_bytes,
+                                hip.hipMemcpyKind.hipMemcpyDeviceToHost))

-        # Cleanup
-        hip_check(hipblas.hipblasDestroy(handle))
-        hip_check(hip.hipFree(result_d))
-        hip_check(hip.hipFree(total_sum_d))
-
-        return result_h
+        return self.__result_h
--- a/GPUSimulators/common/common.py
+++ b/GPUSimulators/common/common.py
@ -185,6 +185,7 @@ def run_simulation(simulator, simulator_args, outfile, save_times, save_var_name
        with tqdm(total=save_times[-1], desc="Simulation progress", unit="sim s", disable=not progress_bar) as pbar:
            # Start simulation loop
            for save_step, t_step in enumerate(t_steps):
+                logger.debug(f"Starting step: {save_step}.")
                t_end = save_step

                # Sanity check simulator
@ -197,8 +198,10 @@ def run_simulation(simulator, simulator_args, outfile, save_times, save_var_name
                profiling_data_sim_runner["start"]["t_full_step"] += time.time()

                # Simulate
+                logger.debug(f"Simulating for {t_step} s.")
                if t_step > 0.0:
                    sim.simulate(t_step, dt, pbar=pbar)
+                logger.debug(f"Completed simulation of {t_step} s.")

                profiling_data_sim_runner["end"]["t_full_step"] += time.time()

@ -208,8 +211,11 @@ def run_simulation(simulator, simulator_args, outfile, save_times, save_var_name
                save_vars = sim.download(download_vars)

                # Save to file
+                logger.debug(f"Saving step [{save_step}] to netCDF.")
                for i, var_name in enumerate(save_var_names):
+                    logger.debug(f"Saving {var_name} ({grid_x0}:{grid_x1}, {grid_y0}:{grid_y1}) to netCDF.")
                    ncvars[var_name][save_step, grid_y0:grid_y1, grid_x0:grid_x1] = save_vars[i]
+                logger.debug(f"Saved step [{save_step}].")

                profiling_data_sim_runner["end"]["t_nc_write"] += time.time()

--- a/GPUSimulators/gpu/cuda_context.py
+++ b/GPUSimulators/gpu/cuda_context.py
@ -110,7 +110,7 @@ class CudaContext(Context):
    def get_module(self, kernel_filename: str,
                   function: str,
                   include_dirs: dict = None,
-                   defines: dict[str: dict] = None,
+                   defines: dict[str, dict] = None,
                   compile_args: dict = None,
                   jit_compile_args: dict = None) -> cuda.Module:
        """
--- a/GPUSimulators/gpu/hip_context.py
+++ b/GPUSimulators/gpu/hip_context.py
@ -64,8 +64,8 @@ class HIPContext(Context):
    def get_module(self, kernel_filename: str,
                   function: str,
                   include_dirs: list[str] = None,
-                   defines: dict[str: int] = None,
-                   compile_args: dict[str: list] = None,
+                   defines: dict[str, int] = None,
+                   compile_args: dict[str, list] = None,
                   jit_compile_args: dict = None):
        """
        Reads a ``.hip`` file and creates a HIP kernel from that.
--- a/GPUSimulators/gpu/hip_handler.py
+++ b/GPUSimulators/gpu/hip_handler.py
@ -21,19 +21,22 @@ class HIPHandler(BaseGPUHandler):

        self.num_bytes = self.cfl_data_h.size * self.cfl_data_h.itemsize
        self.cfl_data = hip_check(hip.hipMalloc(self.num_bytes)).configure(
-            typestr=np.finfo(self.dtype).dtype.name, shape=grid_size
+            typestr=self.cfl_data_h.dtype.str, shape=grid_size
        )

    def __del__(self):
        hip_check(hip.hipFree(self.cfl_data))

-    def prepared_call(self, grid_size, block_size, stream, args):
-        if len(grid_size) < 3:
-            grid_size = (*grid_size, 1)
+    def prepared_call(self, grid_size: tuple[int, int], block_size: tuple[int, int, int], stream: hip.ihipStream_t,
+                      args: list):
+        grid = hip.dim3(*grid_size)
+        block = hip.dim3(*block_size)

        for i in range(len(args)):
            val = args[i]
-            if isinstance(val, int) or isinstance(val, np.int32):
+            if isinstance(val, np.int64):
+                args[i] = ctypes.c_int64(val)
+            elif isinstance(val, int) or isinstance(val, np.int32):
                args[i] = ctypes.c_int(val)
            elif isinstance(val, float) or isinstance(val, np.float32):
                args[i] = ctypes.c_float(val)
@ -42,29 +45,31 @@ class HIPHandler(BaseGPUHandler):

        hip_check(hip.hipModuleLaunchKernel(
            self.kernel,
-            *grid_size,
-            *block_size,
-            0,
-            stream,
-            None,
-            args
+            *grid,
+            *block,
+            sharedMemBytes=0,
+            stream=stream,
+            kernelParams=None,
+            extra=args
        ))

-
-    def array_fill(self, data, stream):
+    def array_fill(self, data: float, stream: hip.ihipStream_t):
        self.cfl_data_h.fill(data)

        hip_check(
            hip.hipMemcpyAsync(self.cfl_data, self.cfl_data_h, self.num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice,
                               stream))

-    def array_min(self, stream):
+    def array_min(self, stream: hip.ihipStream_t) -> float:
        handle = hip_check(hipblas.hipblasCreate())

-        value = np.empty(1, self.dtype)
-        hip_check(hipblas.hipblasIsamin(handle, self.cfl_data.size, self.cfl_data, 1, value))
+        value_h = np.empty(1, self.dtype)
+        value_d = hip_check(hip.hipMalloc(value_h.itemsize))
+
+        hip_check(hipblas.hipblasIsamin(handle, self.cfl_data.size, self.cfl_data, 1, value_d))
        hip_check(hipblas.hipblasDestroy(handle))

-        hip_check(hip.hipMemcpy(value, self.cfl_data, self.cfl_data_h.itemsize, hip.hipMemcpyKind.hipMemcpyDeviceToHost))
+        hip_check(
+            hip.hipMemcpy(value_h, self.cfl_data, self.cfl_data_h.itemsize, hip.hipMemcpyKind.hipMemcpyDeviceToHost))

        return value[0]
--- a/GPUSimulators/mpi/simulator.py
+++ b/GPUSimulators/mpi/simulator.py
@ -80,16 +80,16 @@ class BaseMPISimulator(BaseSimulator):
        })
        gi, gj = grid.get_coordinate()
        # print("gi: " + str(gi) + ", gj: " + str(gj))
-        if gi == 0 and boundary_conditions.west != BoundaryCondition.Type.Periodic:
+        if (gi == 0 and boundary_conditions.west != BoundaryCondition.Type.Periodic):
            self.west = None
            new_boundary_conditions.west = boundary_conditions.west
-        if gj == 0 and boundary_conditions.south != BoundaryCondition.Type.Periodic:
+        if (gj == 0 and boundary_conditions.south != BoundaryCondition.Type.Periodic):
            self.south = None
            new_boundary_conditions.south = boundary_conditions.south
-        if gi == grid.x - 1 and boundary_conditions.east != BoundaryCondition.Type.Periodic:
+        if (gi == grid.x - 1 and boundary_conditions.east != BoundaryCondition.Type.Periodic):
            self.east = None
            new_boundary_conditions.east = boundary_conditions.east
-        if gj == grid.y - 1 and boundary_conditions.north != BoundaryCondition.Type.Periodic:
+        if (gj == grid.y - 1 and boundary_conditions.north != BoundaryCondition.Type.Periodic):
            self.north = None
            new_boundary_conditions.north = boundary_conditions.north
        sim.set_boundary_conditions(new_boundary_conditions)
--- a/GPUSimulators/simulator/boundary.py
+++ b/GPUSimulators/simulator/boundary.py
@ -42,7 +42,7 @@ class BoundaryCondition(object):
        Periodic = 2,
        Reflective = 3

-    def __init__(self, types: dict[str: Type.Reflective]=None):
+    def __init__(self, types: dict[str, Type]=None):
        """
        Constructor
        """
--- a/GPUSimulators/simulator/simulator.py
+++ b/GPUSimulators/simulator/simulator.py
@ -104,7 +104,7 @@ class BaseSimulator(object):
            self.dt = dt

        if tolerance is None:
-            tolerance = 0.000000001
+            tolerance = 0.00001

        while self.sim_time() < t_end:
            # Prevent an infinite loop from occurring from tiny numbers
--- a/mpi_testing_hip.py
+++ b/mpi_testing_hip.py
@ -116,7 +116,7 @@ gamma = 1.4
 # save_times = np.linspace(0, 0.000009, 2)
 # save_times = np.linspace(0, 0.000099, 11)
 # save_times = np.linspace(0, 0.000099, 2)
-save_times = np.linspace(0, 0.1, 5)
+save_times = np.linspace(0, 0.1, 21)
 outfile = "mpi_out.nc4"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
Author	SHA1	Message	Date
Anthony Berg	bf2681fbf0	debug: add shorter stop time to investigate NaNs	2025-09-03 18:33:05 +02:00
Anthony Berg	a1a653e6d8	feat: add more save times to mpi testing	2025-09-03 18:10:25 +02:00
Anthony Berg	87474dcb20	feat(array): improve checking the array for NaNs	2025-09-03 18:08:45 +02:00
Anthony Berg	26c0eab7c8	fix(sim): add more debugging logs in run_simulation	2025-09-03 18:07:48 +02:00
Anthony Berg	833f5bf997	fix(gpu): typing for dictionary in parameter	2025-09-03 18:07:13 +02:00
Anthony Berg	cd89a343bf	feat(sim): increase default tolerance for end time in simulate	2025-09-03 18:06:43 +02:00
Anthony Berg	97c2fd47e3	fix(gpu): correct syntax for dictionary typing	2025-09-03 18:04:43 +02:00