Source code for swiftsimio.visualisation.projection_backends.gpu

from math import sqrt, ceil
from numpy import float64, float32, int32, ndarray
from swiftsimio.optional_packages import (
    CUDA_AVAILABLE,
    cuda_jit,
    CudaSupportError,
    cuda,
)

kernel_gamma = float32(1.897367)


[docs]@cuda_jit("float32(float32, float32)", device=True)
def kernel(r: float32, H: float32):
    """
    Single precision kernel implementation for swiftsimio.

    This is the Wendland-C2 kernel as shown in Denhen & Aly (2012) [1]_.

    Parameters
    ----------

    r : float32
        radius used in kernel computation

    H : float32
        kernel width (i.e. radius of compact support for the kernel)

    Returns
    -------

    float32
        Contribution to the density by the particle

    References
    ----------

    .. [1] Dehnen W., Aly H., 2012, MNRAS, 425, 1068

    Notes
    -----

    This is the cuda-compiled version of the kernel, designed for use
    within the gpu backend. It has no double precision cousin.
    """
    kernel_constant = float32(2.22817109)

    inverse_H = 1.0 / H
    ratio = r * inverse_H

    kernel = 0.0

    if ratio < 1.0:
        one_minus_ratio = 1.0 - ratio
        one_minus_ratio_2 = one_minus_ratio * one_minus_ratio
        one_minus_ratio_4 = one_minus_ratio_2 * one_minus_ratio_2

        kernel = max(one_minus_ratio_4 * (1.0 + 4.0 * ratio), 0.0)

        kernel *= kernel_constant * inverse_H * inverse_H

    return kernel


[docs]@cuda_jit(
    "void(float64[:], float64[:], float32[:], float32[:], float64, float64, float32[:,:])"
)
def scatter_gpu(
    x: float64,
    y: float64,
    m: float32,
    h: float32,
    box_x: float64,
    box_y: float64,
    img: float32,
):
    """
    Creates a weighted scatter plot

    Computes contributions to from particles with positions
    (`x`,`y`) with smoothing lengths `h` weighted by quantities `m`.
    This includes periodic boundary effects.

    Parameters
    ----------

    x : np.array[float64]
        array of x-positions of the particles. Must be bounded by [0, 1].

    y : np.array[float64]
        array of y-positions of the particles. Must be bounded by [0, 1].

    m : np.array[float32]
        array of masses (or otherwise weights) of the particles

    h : np.array[float32]
        array of smoothing lengths of the particles

    box_x: float64
        box size in x, in the same rescaled length units as x and y. Used
        for periodic wrapping.

    box_y: float64
        box size in y, in the same rescaled length units as x and y. Used
        for periodic wrapping.

    img : np.array[float32]
        The output image.

    Notes
    -----

    Explicitly defining the types in this function allows
    for a performance improvement. This is the cuda version,
    and as such can only be ran on systems with a supported
    GPU. Do not call this where cuda is not available (checks
    can be performed using
    ``swiftsimio.optional_packages.CUDA_AVAILABLE``)
    """
    # Output array for our image
    res = img.shape[0]
    maximal_array_index = int32(res) - 1

    # Change that integer to a float, we know that our x, y are bounded
    # by [0, 1].
    float_res = float32(res)
    pixel_width = 1.0 / float_res

    # We need this for combining with the x_pos and y_pos variables.
    float_res_64 = float64(res)

    # Pre-calculate this constant for use with the above
    inverse_cell_area = res * res

    # get the particle index and the x and y index of its periodic copy
    i, dx, dy = cuda.grid(3)
    if i < len(x):
        # Get the correct particle
        mass = m[i]
        hsml = h[i]
        x_pos = x[i] + (dx - 1.0) * box_x
        y_pos = y[i] + (dy - 1.0) * box_y

        # Calculate the cell that this particle; use the 64 bit version of the
        # resolution as this is the same type as the positions
        particle_cell_x = int32(float_res_64 * x_pos)
        particle_cell_y = int32(float_res_64 * y_pos)

        # SWIFT stores hsml as the FWHM.
        kernel_width = kernel_gamma * hsml

        # The number of cells that this kernel spans
        cells_spanned = int32(1.0 + kernel_width * float_res)

        if (
            particle_cell_x + cells_spanned < 0
            or particle_cell_x - cells_spanned > maximal_array_index
            or particle_cell_y + cells_spanned < 0
            or particle_cell_y - cells_spanned > maximal_array_index
        ):
            # Can happily skip this particle
            return

        if cells_spanned <= 1:
            # Easygame, gg
            if (
                particle_cell_x >= 0
                and particle_cell_x <= maximal_array_index
                and particle_cell_y >= 0
                and particle_cell_y <= maximal_array_index
            ):
                cuda.atomic.add(
                    img, (particle_cell_x, particle_cell_y), mass * inverse_cell_area
                )
        else:
            # Now we loop over the square of cells that the kernel lives in
            for cell_x in range(
                # Ensure that the lowest x value is 0, otherwise we'll segfault
                max(0, particle_cell_x - cells_spanned),
                # Ensure that the highest x value lies within the array bounds,
                # otherwise we'll segfault (oops).
                min(particle_cell_x + cells_spanned + 1, maximal_array_index + 1),
            ):
                # The distance in x to our new favourite cell
                # remember that our x, y are all in a box of [0, 1]
                # calculate the distance to the cell center
                distance_x = (float32(cell_x) + 0.5) * pixel_width
                distance_x -= float32(x_pos)
                distance_x_2 = distance_x * distance_x
                for cell_y in range(
                    max(0, particle_cell_y - cells_spanned),
                    min(particle_cell_y + cells_spanned + 1, maximal_array_index + 1),
                ):
                    distance_y = (float32(cell_y) + 0.5) * pixel_width
                    distance_y -= float32(y_pos)
                    distance_y_2 = distance_y * distance_y

                    r = sqrt(distance_x_2 + distance_y_2)

                    kernel_eval = kernel(r, kernel_width)

                    cuda.atomic.add(img, (cell_x, cell_y), mass * kernel_eval)


[docs]def scatter(
    x: float64,
    y: float64,
    m: float32,
    h: float32,
    res: int,
    box_x: float64 = 0.0,
    box_y: float64 = 0.0,
) -> ndarray:
    """
    Parallel implementation of scatter

    Creates a weighted scatter plot. Computes contributions from
    particles with positions (`x`,`y`) with smoothing lengths `h`
    weighted by quantities `m`.
    This includes periodic boundary effects.

    Parameters
    ----------
    x : np.array[float64]
        array of x-positions of the particles. Must be bounded by [0, 1].

    y : np.array[float64]
        array of y-positions of the particles. Must be bounded by [0, 1].

    m : np.array[float32]
        array of masses (or otherwise weights) of the particles

    h : np.array[float32]
        array of smoothing lengths of the particles

    res : int
        the number of pixels along one axis, i.e. this returns a square
        of res * res.

    box_x: float64
        box size in x, in the same rescaled length units as x and y. Used
        for periodic wrapping.

    box_y: float64
        box size in y, in the same rescaled length units as x and y. Used
        for periodic wrapping.

    Returns
    -------

    np.array[float32, float32, float32]
        pixel grid of quantity

    See Also
    --------

    scatter : Creates 2D scatter plot from SWIFT data

    Notes
    -----

    Explicitly defining the types in this function allows
    a performance improvement.
    """
    if not CUDA_AVAILABLE or cuda is None:
        raise CudaSupportError(
            "Unable to load the CUDA extension to numba. This function "
            "is only available on systems with supported GPUs."
        )

    output = cuda.device_array((res, res), dtype=float32)
    output[:] = 0

    n_part = len(x)
    if box_x == 0.0:
        n_xshift = 1
    else:
        n_xshift = 3
    if box_y == 0.0:
        n_yshift = 1
    else:
        n_yshift = 3
    # set up a 3D grid:
    # the first dimension are the particles
    # the second and third dimension are the periodic
    # copies for each particle
    threads_per_block = (16, 1, 1)
    blocks_per_grid = (
        ceil(n_part / threads_per_block[0]),
        n_xshift // threads_per_block[1],
        n_yshift // threads_per_block[2],
    )
    scatter_gpu[blocks_per_grid, threads_per_block](x, y, m, h, box_x, box_y, output)

    return output.copy_to_host()


scatter_parallel = scatter