# -*- coding: utf-8 -*-

"""
This Python module implements helper functions and classes for autotuning
the CUDA block size of the GPU simulators.

Copyright (C) 2018 SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import os
import gc
import logging
from socket import gethostname

import numpy as np
from tqdm.auto import tqdm
import pycuda.driver as cuda

from GPUSimulators import Simulator
from GPUSimulators.common import common, Timer
from GPUSimulators.gpu import CudaContext


def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
    """
    Runs a benchmark, and returns the number of megacells achieved
    """
    logger = logging.getLogger(__name__)

    # Initialize simulator
    try:
        sim = simulator(**arguments)
    except Exception:
        # An exception was raised - not possible to continue
        logger.debug(f"Failed creating {simulator.__name__} with arguments {str(arguments)}")
        return np.nan

    # Create timer events
    start = cuda.Event()
    end = cuda.Event()

    # Warmup
    for i in range(warmup_timesteps):
        sim.substep(sim.dt, i)

    # Run simulation with timer
    start.record(sim.stream)
    for i in range(timesteps):
        sim.substep(sim.dt, i)
    end.record(sim.stream)

    # Synchronize end event
    end.synchronize()

    # Compute megacells (time_since returns milliseconds)
    gpu_elapsed = end.time_since(start) * 1.0e-3
    megacells = (sim.nx * sim.ny * timesteps / (1000 * 1000)) / gpu_elapsed

    # Sanity check solution
    h, hu, hv = sim.download()
    sane = True
    sane = sane and sanity_check(h, 0.3, 0.7)
    sane = sane and sanity_check(hu, -0.2, 0.2)
    sane = sane and sanity_check(hv, -0.2, 0.2)

    if sane:
        logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f",
                     simulator.__name__, arguments["block_width"], arguments["block_height"],
                     megacells, gpu_elapsed)
        return megacells
    else:
        logger.debug("%s [%d x %d] failed: gpu elapsed %f",
                     simulator.__name__, arguments["block_width"], arguments["block_height"],
                     gpu_elapsed)
        return np.nan


def gen_test_data(nx, ny, g):
    """
    Generates a test dataset
    """
    width = 100.0
    height = 100.0
    dx = width / float(nx)
    dy = height / float(ny)

    x_center = dx * nx / 2.0
    y_center = dy * ny / 2.0

    # Create a gaussian "dam break" that will not form shocks
    size = width / 5.0
    dt = 10 ** 10

    h = np.zeros((ny, nx), dtype=np.float32)
    hu = np.zeros((ny, nx), dtype=np.float32)
    hv = np.zeros((ny, nx), dtype=np.float32)

    extent = 1.0 / np.sqrt(2.0)
    x = (dx * (np.arange(0, nx, dtype=np.float32) + 0.5) - x_center) / size
    y = (dy * (np.arange(0, ny, dtype=np.float32) + 0.5) - y_center) / size
    xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
    r = np.minimum(1.0, np.sqrt(xv ** 2 + yv ** 2))
    xv = None
    yv = None
    gc.collect()

    # Generate highres initial conditions
    cos = np.cos(np.pi * r)
    h = 0.5 + 0.1 * 0.5 * (1.0 + cos)
    hu = 0.1 * 0.5 * (1.0 + cos)
    hv = hu.copy()

    # Estimate a CFL-limited timestep from the maximum expected wave speed
    scale = 0.7
    max_h_estimate = 0.6
    max_u_estimate = 0.1 * np.sqrt(2.0)
    dx = width / nx
    dy = height / ny
    dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g * max_h_estimate))

    return h, hu, hv, dx, dy, dt

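# Illustrative usage sketch (an assumption, not part of the original module):
# how gen_test_data and sanity_check might be combined when setting up a
# benchmark by hand. The grid size and gravity value are placeholders; the
# bounds mirror the ones used in run_benchmark above.
#
#   h, hu, hv, dx, dy, dt = gen_test_data(nx=256, ny=256, g=9.81)
#   assert sanity_check(h, 0.3, 0.7)
#   assert sanity_check(hu, -0.2, 0.2)
#   assert sanity_check(hv, -0.2, 0.2)
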
def sanity_check(variable, bound_min, bound_max):
    """
    Checks that a variable is "sane": free of NaNs and within the given bounds
    """
    maxval = np.amax(variable)
    minval = np.amin(variable)
    if (np.isnan(maxval) or np.isnan(minval)
            or maxval > bound_max or minval < bound_min):
        return False
    else:
        return True


def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
    """
    Runs a set of benchmarks for a single simulator
    """
    logger = logging.getLogger(__name__)

    megacells = np.empty((len(block_heights), len(block_widths)))
    megacells.fill(np.nan)

    logger.debug("Running %d benchmarks with %s",
                 len(block_heights) * len(block_widths), simulator.__name__)

    sim_arguments = arguments.copy()

    with Timer(simulator.__name__) as t:
        for j, block_height in enumerate(tqdm(block_heights, desc='Autotuner Progress')):
            sim_arguments.update({'block_height': block_height})
            for i, block_width in enumerate(tqdm(block_widths, desc=f'Iteration {j} Progress', leave=False)):
                sim_arguments.update({'block_width': block_width})
                megacells[j, i] = run_benchmark(simulator, sim_arguments)

    logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)

    return megacells


class Autotuner:
    def __init__(self,
                 nx=2048, ny=2048,
                 block_widths=range(8, 32, 1),
                 block_heights=range(8, 32, 1)):
        logger = logging.getLogger(__name__)
        self.filename = f"autotuning_data_{gethostname()}.npz"
        self.nx = nx
        self.ny = ny
        self.block_widths = block_widths
        self.block_heights = block_heights
        self.performance = {}

    def benchmark(self, simulator, force=False):
        logger = logging.getLogger(__name__)

        # Run through simulators and benchmark
        key = str(simulator.__name__)
        logger.info(f"Benchmarking {key} to {self.filename}")

        # If this simulator has been benchmarked already, skip it
        if not force and os.path.isfile(self.filename):
            with np.load(self.filename) as data:
                if key in data["simulators"]:
                    logger.info(f"{key} already benchmarked - skipping")
                    return

        # Set arguments to send to the simulators during construction
        context = CudaContext(autotuning=False)
        g = 9.81
        h0, hu0, hv0, dx, dy, dt = gen_test_data(nx=self.nx, ny=self.ny, g=g)
        arguments = {
            'context': context,
            'h0': h0, 'hu0': hu0, 'hv0': hv0,
            'nx': self.nx, 'ny': self.ny,
            'dx': dx, 'dy': dy, 'dt': 0.9 * dt,
            'g': g,
            'compile_opts': ['-Wno-deprecated-gpu-targets']
        }

        # Load existing data into memory
        benchmark_data = {
            "simulators": [],
        }
        if os.path.isfile(self.filename):
            with np.load(self.filename) as data:
                for k, v in data.items():
                    benchmark_data[k] = v

        # Run benchmark
        benchmark_data[key + "_megacells"] = benchmark_single_simulator(
            simulator, arguments, self.block_widths, self.block_heights)
        benchmark_data[key + "_block_widths"] = self.block_widths
        benchmark_data[key + "_block_heights"] = self.block_heights
        benchmark_data[key + "_arguments"] = str(arguments)

        existing_sims = benchmark_data["simulators"]
        if isinstance(existing_sims, np.ndarray):
            existing_sims = existing_sims.tolist()
        if key not in existing_sims:
            benchmark_data["simulators"] = existing_sims + [key]

        # Save to file
        np.savez_compressed(self.filename, **benchmark_data)

    def get_peak_performance(self, simulator):
        """
        Reads the numpy file with autotuning data and reports the
        maximum performance and corresponding block size
        """
        logger = logging.getLogger(__name__)

        assert issubclass(simulator, Simulator.BaseSimulator)
        key = simulator.__name__

        if key in self.performance:
            return self.performance[key]
        else:
            # Run the benchmark if required
            if not os.path.isfile(self.filename):
                logger.debug(f"Could not get autotuned peak performance for {key}: benchmarking")
{key}: benchmarking") self.benchmark(simulator) with np.load(self.filename) as data: if key not in data['simulators']: logger.debug(f"Could not get autotuned peak performance for {key}: benchmarking") data.close() self.benchmark(simulator) data = np.load(self.filename) def find_max_index(megacells): max_index = np.nanargmax(megacells) return np.unravel_index(max_index, megacells.shape) megacells = data[key + '_megacells'] block_widths = data[key + '_block_widths'] block_heights = data[key + '_block_heights'] j, i = find_max_index(megacells) self.performance[key] = {"block_width": block_widths[i], "block_height": block_heights[j], "megacells": megacells[j, i]} logger.debug(f"Returning {self.performance[key]} as peak performance parameters") return self.performance[key] # This should never happen raise "Something wrong: Could not get autotuning data!" return None