kk_contour_net_shallow/RenderStimuli/contours.py

# %%
#
# contours.py
#
# Tools for contour integration studies
#
# Version 2.0, 28.04.2023:
#   phases can now be randomized...
#

#
# Coordinate system assumptions:
#
# for arrays:
#   [..., HEIGHT, WIDTH], origin is on TOP LEFT
#   HEIGHT indices *decrease* with increasing y-coordinates (reversed)
#   WIDTH indices *increase* with increasing x-coordinates (normal)
#
# Orientations:
#   0 is horizontal, orientation *increase* counter-clockwise
#   Corner elements, quantified by [dir_source, dir_change]:
#       - consist of two legs
#       - contour *enters* corner from *source direction* at one leg
#         and goes from border to its center...
#       - contour path changes by *direction change* and goes
#         from center to the border
#

import torch
import time
import matplotlib.pyplot as plt
import math
import scipy.io
import numpy as np
import os

torch_device = "cuda"
default_dtype = torch.float32
torch.set_default_dtype(default_dtype)
torch.device(torch_device)


#
# performs a coordinate transform (rotation with phi around origin)
# rotation is performed CLOCKWISE with increasing phi
#
# remark: rotating a mesh grid by phi and orienting an image element
# along the new x-axis is EQUIVALENT to rotating the image element
# by -phi (so this realizes a rotation COUNTER-CLOCKWISE with
# increasing phi)
#
def rotate_CW(x: torch.Tensor, y: torch.Tensor, phi: torch.float32):
    xr = +x * torch.cos(phi) + y * torch.sin(phi)
    yr = -x * torch.sin(phi) + y * torch.cos(phi)

    return xr, yr


#
# renders a Gabor with (or without) corner
#
def gaborner(
    r_gab: int,  # radius, size will be 2*r_gab+1
    dir_source: float,  # contour enters in this dir
    dir_change: float,  # contour turns around by this dir
    lambdah: float,  # wavelength of Gabor
    sigma: float,  # half-width of Gabor
    phase: float,  # phase of Gabor
    normalize: bool,  # normalize patch to zero
    torch_device: str,  # GPU or CPU...
) -> torch.Tensor:
    # incoming dir: change to outgoing dir
    dir1 = dir_source + torch.pi
    nook = dir_change - torch.pi

    # create coordinate grids
    d_gab = 2 * r_gab + 1
    x = -r_gab + torch.arange(d_gab, device=torch_device)
    yg, xg = torch.meshgrid(x, x, indexing="ij")

    # put into tensor for performing vectorized scalar products
    xyg = torch.zeros([d_gab, d_gab, 1, 2], device=torch_device)
    xyg[:, :, 0, 0] = xg
    xyg[:, :, 0, 1] = yg

    # create Gaussian hull
    gauss = torch.exp(-(xg**2 + yg**2) / 2 / sigma**2)
    gabor_corner = gauss.clone()

    if (dir_change == 0) or (dir_change == torch.pi):
        # handle special case of straight Gabor or change by 180 deg

        # vector orth to Gabor axis
        ev1_orth = torch.tensor(
            [math.cos(-dir1 + math.pi / 2), math.sin(-dir1 + math.pi / 2)],
            device=torch_device,
        )
        # project coords to orth vector to get distance
        legs = torch.cos(
            2
            * torch.pi
            * torch.matmul(xyg, ev1_orth.unsqueeze(1).unsqueeze(0).unsqueeze(0))
            / lambdah
            + phase
        )
        gabor_corner *= legs[:, :, 0, 0]

    else:
        dir2 = dir1 + nook

        # compute separation line between corner's legs
        ev1 = torch.tensor([math.cos(-dir1), math.sin(-dir1)], device=torch_device)
        ev2 = torch.tensor([math.cos(-dir2), math.sin(-dir2)], device=torch_device)
        v_towards_1 = (ev1 - ev2).unsqueeze(1).unsqueeze(0).unsqueeze(0)

        # which coords belong to which leg?
        which_side = torch.matmul(xyg, v_towards_1)[:, :, 0, 0]
        towards_1y, towards_1x = torch.where(which_side > 0)
        towards_2y, towards_2x = torch.where(which_side <= 0)

        # compute orth distance to legs
        side_sign = -1 + 2 * ((dir_change % 2 * torch.pi) > torch.pi)
        ev12 = ev1 + ev2
        v1_orth = ev12 - ev1 * torch.matmul(ev12, ev1)
        v2_orth = ev12 - ev2 * torch.matmul(ev12, ev2)
        ev1_orth = side_sign * v1_orth / torch.sqrt((v1_orth**2).sum())
        ev2_orth = side_sign * v2_orth / torch.sqrt((v2_orth**2).sum())

        leg1 = torch.cos(
            2
            * torch.pi
            * torch.matmul(xyg, ev1_orth.unsqueeze(1).unsqueeze(0).unsqueeze(0))
            / lambdah
            + phase
        )
        leg2 = torch.cos(
            2
            * torch.pi
            * torch.matmul(xyg, ev2_orth.unsqueeze(1).unsqueeze(0).unsqueeze(0))
            / lambdah
            + phase
        )
        gabor_corner[towards_1y, towards_1x] *= leg1[towards_1y, towards_1x, 0, 0]
        gabor_corner[towards_2y, towards_2x] *= leg2[towards_2y, towards_2x, 0, 0]

    # depending on phase, Gabor might not be normalized...
    if normalize:
        s = gabor_corner.sum()
        s0 = gauss.sum()
        gabor_corner -= s / s0 * gauss

    return gabor_corner


#
# creates a filter bank of Gabor corners
#
# outputs:
#   filters: [n_source, n_change, HEIGHT, WIDTH]
#   dirs_source: [n_source]
#   dirs_change: [n_change]
#
def gaborner_filterbank(
    r_gab: int,  # radius, size will be 2*r_gab+1
    n_source: int,  # number of source orientations
    n_change: int,  # number of direction changes
    lambdah: float,  # wavelength of Gabor
    sigma: float,  # half-width of Gabor
    phase: float,  # phase of Gabor
    normalize: bool,  # normalize patch to zero
    torch_device: str,  # GPU or CPU...
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    kernels = torch.zeros(
        [n_source, n_change, 2 * r_gab + 1, 2 * r_gab + 1],
        device=torch_device,
        requires_grad=False,
    )
    dirs_source = 2 * torch.pi * torch.arange(n_source, device=torch_device) / n_source
    dirs_change = 2 * torch.pi * torch.arange(n_change, device=torch_device) / n_change

    for i_source in range(n_source):
        for i_change in range(n_change):
            gabor_corner = gaborner(
                r_gab=r_gab,
                dir_source=dirs_source[i_source],
                dir_change=dirs_change[i_change],
                lambdah=lambdah,
                sigma=sigma,
                phase=phase,
                normalize=normalize,
                torch_device=torch_device,
            )
            kernels[i_source, i_change] = gabor_corner

            # check = torch.isnan(gabor_corner).sum()
            # if check > 0:
            #     print(i_source, i_change, check)
            #     kernels[i_source, i_change] = 1

    return kernels, dirs_source, dirs_change


def discretize_stimuli(
    posori,
    x_range: tuple,
    y_range: tuple,
    scale_factor: float,
    r_gab_PIX: int,
    n_source: int,
    n_change: int,
    torch_device: str,
    n_phase: int = 1,
) -> torch.Tensor:
    # check correct input size
    s = posori.shape
    assert len(s) == 2, "posori should be NDARRAY with N x 1 entries"
    assert s[1] == 1, "posori should be NDARRAY with N x 1 entries"

    # determine size of (extended) canvas
    x_canvas_PIX = torch.tensor(
        (x_range[1] - x_range[0]) * scale_factor, device=torch_device
    ).ceil()
    y_canvas_PIX = torch.tensor(
        (y_range[1] - y_range[0]) * scale_factor, device=torch_device
    ).ceil()
    x_canvas_ext_PIX = int(x_canvas_PIX + 2 * r_gab_PIX)
    y_canvas_ext_PIX = int(y_canvas_PIX + 2 * r_gab_PIX)

    # get number of contours
    n_contours = s[0]
    index_phasrcchg = []
    index_y = []
    index_x = []
    for i_contour in range(n_contours):
        x_y_src_chg = torch.asarray(posori[i_contour, 0][1:, :].copy())
        x_y_src_chg[2] += torch.pi

        # if i_contour == 0:
        #     print(x_y_src_chg[2][:3])

        # compute integer coordinates and find all visible elements
        x = ((x_y_src_chg[0] - x_range[0]) * scale_factor + r_gab_PIX).type(torch.long)
        y = y_canvas_ext_PIX - (
            (x_y_src_chg[1] - y_range[0]) * scale_factor + r_gab_PIX
        ).type(torch.long)
        i_visible = torch.where(
            (x >= 0) * (y >= 0) * (x < x_canvas_ext_PIX) * (y < y_canvas_ext_PIX)
        )[0]

        # compute integer (changes of) directions
        i_source = (
            ((((x_y_src_chg[2]) / (2 * torch.pi)) + 1 / (2 * n_source)) % 1) * n_source
        ).type(torch.long)
        i_change = (
            (((x_y_src_chg[3] / (2 * torch.pi)) + 1 / (2 * n_change)) % 1) * n_change
        ).type(torch.long)

        i_phase = torch.randint(n_phase, i_visible.size())
        index_phasrcchg.append(
            (i_phase * n_source + i_source[i_visible]) * n_change + i_change[i_visible]
        )
        # index_change.append(i_change[i_visible])
        index_y.append(y[i_visible])
        index_x.append(x[i_visible])

    return (
        index_phasrcchg,
        index_x,
        index_y,
        x_canvas_ext_PIX,
        y_canvas_ext_PIX,
    )


def render_stimulus(
    kernels, index_element, index_y, index_x, y_canvas, x_canvas, torch_device
):
    s = kernels.shape
    kx = s[-1]
    ky = s[-2]

    stimulus = torch.zeros((y_canvas + ky - 1, x_canvas + kx - 1), device=torch_device)
    n = index_element.size()[0]
    for i in torch.arange(n, device=torch_device):
        x = index_x[i]
        y = index_y[i]
        stimulus[y : y + ky, x : x + kx] += kernels[index_element[i]]

    return stimulus[ky - 1 : -(ky - 1), kx - 1 : -(kx - 1)]


if __name__ == "__main__":
    VERBOSE = True
    BENCH_CONVOLVE = True
    BENCH_GPU = True
    BENCH_CPU = True
    BENCH_DAVID = True

    print("Testing contour rendering speed:")
    print("================================")

    # load contours, multiplex coordinates to simulate a larger set of contours
    n_multiplex = 1
    mat = scipy.io.loadmat("z.mat")
    posori = np.tile(mat["z"], (n_multiplex, 1))
    n_contours = posori.shape[0]
    print(f"Processing {n_contours} contour stimuli")

    # how many contours to render simultaneously?
    n_simultaneous = 5
    n_simultaneous_chunks, n_remaining = divmod(n_contours, n_simultaneous)
    assert n_remaining == 0, "Check parameters for simultaneous contour rendering!"

    # repeat some times for speed testing
    n_repeat = 10
    t_dis = torch.zeros((n_repeat + 2), device=torch_device)
    t_con = torch.zeros((n_repeat + 2), device=torch_device)
    t_rsg = torch.zeros((n_repeat + 2), device=torch_device)
    t_rsc = torch.zeros((n_repeat + 2), device="cpu")
    t_rsd = torch.zeros((n_repeat + 2), device="cpu")

    # cutout for stimuli, and gabor parameters
    x_range = [140, 940]
    y_range = [140, 940]
    d_gab = 40
    lambdah = 12
    sigma = 8
    phase = 0.0
    normalize = True

    # scale to convert coordinates to pixel values
    scale_factor = 0.25

    # number of directions for dictionary
    n_source = 32
    n_change = 32

    # convert sizes to pixel units
    lambdah_PIX = lambdah * scale_factor
    sigma_PIX = sigma * scale_factor
    r_gab_PIX = int(d_gab * scale_factor / 2)
    d_gab_PIX = r_gab_PIX * 2 + 1

    # make filterbank
    kernels, dirs_source, dirs_change = gaborner_filterbank(
        r_gab=r_gab_PIX,
        n_source=n_source,
        n_change=n_change,
        lambdah=lambdah_PIX,
        sigma=sigma_PIX,
        phase=phase,
        normalize=normalize,
        torch_device=torch_device,
    )
    kernels = kernels.reshape([1, n_source * n_change, d_gab_PIX, d_gab_PIX])
    kernels_flip = kernels.flip(dims=(-1, -2))

    # define "network" and put to cuda
    conv = torch.nn.Conv2d(
        in_channels=n_source * n_change,
        out_channels=1,
        kernel_size=d_gab_PIX,
        stride=1,
        device=torch_device,
    )
    conv.weight.data = kernels_flip

    print("Discretizing START!!!")
    t_dis[0] = time.perf_counter()
    for i_rep in range(n_repeat):
        # discretize
        (
            index_srcchg,
            index_x,
            index_y,
            x_canvas,
            y_canvas,
        ) = discretize_stimuli(
            posori=posori,
            x_range=x_range,
            y_range=y_range,
            scale_factor=scale_factor,
            r_gab_PIX=r_gab_PIX,
            n_source=n_source,
            n_change=n_change,
            torch_device=torch_device,
        )
        t_dis[i_rep + 1] = time.perf_counter()
    t_dis[-1] = time.perf_counter()
    print("Discretizing END!!!")

    if BENCH_CONVOLVE:
        print("Allocating!")
        stimuli = torch.zeros(
            [n_simultaneous, n_source * n_change, y_canvas, x_canvas],
            device=torch_device,
            requires_grad=False,
        )

        print("Generation by CONVOLUTION start!")
        t_con[0] = time.perf_counter()
        for i_rep in torch.arange(n_repeat):
            for i_simultaneous_chunks in torch.arange(n_simultaneous_chunks):
                i_ofs = i_simultaneous_chunks * n_simultaneous

                for i_sim in torch.arange(n_simultaneous):
                    stimuli[
                        i_sim,
                        index_srcchg[i_sim + i_ofs],
                        index_y[i_sim + i_ofs],
                        index_x[i_sim + i_ofs],
                    ] = 1

                output = conv(stimuli)

                for i_sim in range(n_simultaneous):
                    stimuli[
                        i_sim,
                        index_srcchg[i_sim + i_ofs],
                        index_y[i_sim + i_ofs],
                        index_x[i_sim + i_ofs],
                    ] = 0

            t_con[i_rep + 1] = time.perf_counter()
        t_con[-1] = time.perf_counter()
        print("Generation by CONVOLUTION stop!")

    if BENCH_GPU:
        print("Generation by GPU start!")
        output_gpu = torch.zeros(
            (
                n_contours,
                y_canvas - d_gab_PIX + 1,
                x_canvas - d_gab_PIX + 1,
            ),
            device=torch_device,
        )
        t_rsg[0] = time.perf_counter()
        for i_rep in torch.arange(n_repeat):
            for i_con in torch.arange(n_contours):
                output_gpu[i_con] = render_stimulus(
                    kernels=kernels[0],
                    index_element=index_srcchg[i_con],
                    index_y=index_y[i_con],
                    index_x=index_x[i_con],
                    y_canvas=y_canvas,
                    x_canvas=x_canvas,
                    torch_device=torch_device,
                )
            # output_gpu = torch.clip(output_gpu, -1, +1)

            t_rsg[i_rep + 1] = time.perf_counter()
        t_rsg[-1] = time.perf_counter()
        print("Generation by GPU stop!")

    if BENCH_CPU:
        print("Generation by CPU start!")
        output_cpu = torch.zeros(
            (
                n_contours,
                y_canvas - d_gab_PIX + 1,
                x_canvas - d_gab_PIX + 1,
            ),
            device="cpu",
        )
        kernels_cpu = kernels.detach().cpu()
        t_rsc[0] = time.perf_counter()
        for i_rep in range(n_repeat):
            for i_con in range(n_contours):
                output_cpu[i_con] = render_stimulus(
                    kernels=kernels_cpu[0],
                    index_element=index_srcchg[i_con],
                    index_y=index_y[i_con],
                    index_x=index_x[i_con],
                    y_canvas=y_canvas,
                    x_canvas=x_canvas,
                    torch_device="cpu",
                )
            # output_cpu = torch.clip(output_cpu, -1, +1)

            t_rsc[i_rep + 1] = time.perf_counter()
        t_rsc[-1] = time.perf_counter()
        print("Generation by CPU stop!")

    if BENCH_DAVID:
        print("Generation by DAVID start!")
        from CPPExtensions.PyTCopyCPU import TCopyCPU as render_stimulus_CPP

        copyier = render_stimulus_CPP()

        number_of_cpu_processes = os.cpu_count()
        output_dav_tmp = torch.zeros(
            (
                n_contours,
                y_canvas + 2 * r_gab_PIX,
                x_canvas + 2 * r_gab_PIX,
            ),
            device="cpu",
            dtype=torch.float,
        )
        gabor = kernels[0].detach().cpu()

        # Umsort!
        n_elements_total = 0
        for i_con in range(n_contours):
            n_elements_total += len(index_x[i_con])
        sparse_matrix = torch.zeros(
            (n_elements_total, 4), device="cpu", dtype=torch.int64
        )
        i_elements_total = 0
        for i_con in range(n_contours):
            n_add = len(index_x[i_con])
            sparse_matrix[i_elements_total : i_elements_total + n_add, 0] = i_con
            sparse_matrix[
                i_elements_total : i_elements_total + n_add, 1
            ] = index_srcchg[i_con]
            sparse_matrix[i_elements_total : i_elements_total + n_add, 2] = index_y[
                i_con
            ]
            sparse_matrix[i_elements_total : i_elements_total + n_add, 3] = index_x[
                i_con
            ]
            i_elements_total += n_add
        assert i_elements_total == n_elements_total, "UNBEHAGEN macht sich breit!"

        t_dav = torch.zeros((n_repeat + 2), device="cpu")
        t_dav[0] = time.perf_counter()
        for i_rep in range(n_repeat):
            output_dav_tmp.fill_(0.0)
            copyier.process(
                sparse_matrix.data_ptr(),
                int(sparse_matrix.shape[0]),
                int(sparse_matrix.shape[1]),
                gabor.data_ptr(),
                int(gabor.shape[0]),
                int(gabor.shape[1]),
                int(gabor.shape[2]),
                output_dav_tmp.data_ptr(),
                int(output_dav_tmp.shape[0]),
                int(output_dav_tmp.shape[1]),
                int(output_dav_tmp.shape[2]),
                int(number_of_cpu_processes),
            )
            output_dav = output_dav_tmp[
                :,
                d_gab_PIX - 1 : -(d_gab_PIX - 1),
                d_gab_PIX - 1 : -(d_gab_PIX - 1),
            ].clone()
            t_dav[i_rep + 1] = time.perf_counter()
        t_dav[-1] = time.perf_counter()
        print("Generation by DAVID done!")

    if VERBOSE:  # show last stimulus
        if BENCH_CONVOLVE:
            plt.subplot(2, 2, 1)
            plt.imshow(output[-1, 0].detach().cpu(), cmap="gray", vmin=-1, vmax=+1)
            plt.title("convolve")
        if BENCH_GPU:
            plt.subplot(2, 2, 2)
            plt.imshow(output_gpu[-1].detach().cpu(), cmap="gray", vmin=-1, vmax=+1)
            plt.title("gpu")
        if BENCH_CPU:
            plt.subplot(2, 2, 3)
            plt.imshow(output_cpu[-1], cmap="gray", vmin=-1, vmax=+1)
            plt.title("cpu")
        if BENCH_DAVID:
            plt.subplot(2, 2, 4)
            plt.imshow(output_dav[-1], cmap="gray", vmin=-1, vmax=+1)
            plt.title("david")
        plt.show()

    dt_discretize = t_dis.diff() / n_contours
    plt.plot(dt_discretize.detach().cpu())
    dt_convolve = t_con.diff() / n_contours
    plt.plot(dt_convolve.detach().cpu())
    dt_gpu = t_rsg.diff() / n_contours
    plt.plot(dt_gpu.detach().cpu())
    dt_cpu = t_rsc.diff() / n_contours
    plt.plot(dt_cpu.detach().cpu())
    dt_david = t_dav.diff() / n_contours
    plt.plot(dt_david.detach().cpu())

    plt.legend(["discretize", "convolve", "gpu", "cpu", "david"])
    plt.show()
    print(
        f"Average discretize for 1k stims: {1000*dt_discretize[:-1].detach().cpu().mean()} secs."
    )
    print(
        f"Average convolve for 1k stims: {1000*dt_convolve[:-1].detach().cpu().mean()} secs."
    )
    print(f"Average gpu for 1k stims: {1000*dt_gpu[:-1].detach().cpu().mean()} secs.")
    print(f"Average cpu for 1k stims: {1000*dt_cpu[:-1].detach().cpu().mean()} secs.")
    print(
        f"Average david for 1k stims: {1000*dt_david[:-1].detach().cpu().mean()} secs."
    )

    if BENCH_GPU and BENCH_CPU and BENCH_DAVID:
        df1 = (torch.abs(output_gpu[-1].detach().cpu() - output_cpu[-1])).mean()
        df2 = (torch.abs(output_gpu[-1].detach().cpu() - output_dav[-1])).mean()
        df3 = (torch.abs(output_dav[-1].cpu() - output_cpu[-1])).mean()
        print(f"Differences: CPU-GPU:{df1}, GPU-David:{df2}, David-CPU:{df3}")

    # %%
Add files via upload 2023-07-28 15:42:20 +02:00			`# %%`
			`#`
			`# contours.py`
			`#`
			`# Tools for contour integration studies`
			`#`
			`# Version 2.0, 28.04.2023:`
			`# phases can now be randomized...`
			`#`

			`#`
			`# Coordinate system assumptions:`
			`#`
			`# for arrays:`
			`# [..., HEIGHT, WIDTH], origin is on TOP LEFT`
			`# HEIGHT indices decrease with increasing y-coordinates (reversed)`
			`# WIDTH indices increase with increasing x-coordinates (normal)`
			`#`
			`# Orientations:`
			`# 0 is horizontal, orientation increase counter-clockwise`
			`# Corner elements, quantified by [dir_source, dir_change]:`
			`# - consist of two legs`
			`# - contour enters corner from source direction at one leg`
			`# and goes from border to its center...`
			`# - contour path changes by direction change and goes`
			`# from center to the border`
			`#`

			`import torch`
			`import time`
			`import matplotlib.pyplot as plt`
			`import math`
			`import scipy.io`
			`import numpy as np`
			`import os`

			`torch_device = "cuda"`
			`default_dtype = torch.float32`
			`torch.set_default_dtype(default_dtype)`
			`torch.device(torch_device)`


			`#`
			`# performs a coordinate transform (rotation with phi around origin)`
			`# rotation is performed CLOCKWISE with increasing phi`
			`#`
			`# remark: rotating a mesh grid by phi and orienting an image element`
			`# along the new x-axis is EQUIVALENT to rotating the image element`
			`# by -phi (so this realizes a rotation COUNTER-CLOCKWISE with`
			`# increasing phi)`
			`#`
			`def rotate_CW(x: torch.Tensor, y: torch.Tensor, phi: torch.float32):`
			`xr = +x * torch.cos(phi) + y * torch.sin(phi)`
			`yr = -x * torch.sin(phi) + y * torch.cos(phi)`

			`return xr, yr`


			`#`
			`# renders a Gabor with (or without) corner`
			`#`
			`def gaborner(`
			`r_gab: int, # radius, size will be 2*r_gab+1`
			`dir_source: float, # contour enters in this dir`
			`dir_change: float, # contour turns around by this dir`
			`lambdah: float, # wavelength of Gabor`
			`sigma: float, # half-width of Gabor`
			`phase: float, # phase of Gabor`
			`normalize: bool, # normalize patch to zero`
			`torch_device: str, # GPU or CPU...`
			`) -> torch.Tensor:`
			`# incoming dir: change to outgoing dir`
			`dir1 = dir_source + torch.pi`
			`nook = dir_change - torch.pi`

			`# create coordinate grids`
			`d_gab = 2 * r_gab + 1`
			`x = -r_gab + torch.arange(d_gab, device=torch_device)`
			`yg, xg = torch.meshgrid(x, x, indexing="ij")`

			`# put into tensor for performing vectorized scalar products`
			`xyg = torch.zeros([d_gab, d_gab, 1, 2], device=torch_device)`
			`xyg[:, :, 0, 0] = xg`
			`xyg[:, :, 0, 1] = yg`

			`# create Gaussian hull`
			`gauss = torch.exp(-(xg2 + yg2) / 2 / sigma**2)`
			`gabor_corner = gauss.clone()`

			`if (dir_change == 0) or (dir_change == torch.pi):`
			`# handle special case of straight Gabor or change by 180 deg`

			`# vector orth to Gabor axis`
			`ev1_orth = torch.tensor(`
			`[math.cos(-dir1 + math.pi / 2), math.sin(-dir1 + math.pi / 2)],`
			`device=torch_device,`
			`)`
			`# project coords to orth vector to get distance`
			`legs = torch.cos(`
			`2`
			`* torch.pi`
			`* torch.matmul(xyg, ev1_orth.unsqueeze(1).unsqueeze(0).unsqueeze(0))`
			`/ lambdah`
			`+ phase`
			`)`
			`gabor_corner *= legs[:, :, 0, 0]`

			`else:`
			`dir2 = dir1 + nook`

			`# compute separation line between corner's legs`
			`ev1 = torch.tensor([math.cos(-dir1), math.sin(-dir1)], device=torch_device)`
			`ev2 = torch.tensor([math.cos(-dir2), math.sin(-dir2)], device=torch_device)`
			`v_towards_1 = (ev1 - ev2).unsqueeze(1).unsqueeze(0).unsqueeze(0)`

			`# which coords belong to which leg?`
			`which_side = torch.matmul(xyg, v_towards_1)[:, :, 0, 0]`
			`towards_1y, towards_1x = torch.where(which_side > 0)`
			`towards_2y, towards_2x = torch.where(which_side <= 0)`

			`# compute orth distance to legs`
			`side_sign = -1 + 2 * ((dir_change % 2 * torch.pi) > torch.pi)`
			`ev12 = ev1 + ev2`
			`v1_orth = ev12 - ev1 * torch.matmul(ev12, ev1)`
			`v2_orth = ev12 - ev2 * torch.matmul(ev12, ev2)`
			`ev1_orth = side_sign * v1_orth / torch.sqrt((v1_orth**2).sum())`
			`ev2_orth = side_sign * v2_orth / torch.sqrt((v2_orth**2).sum())`

			`leg1 = torch.cos(`
			`2`
			`* torch.pi`
			`* torch.matmul(xyg, ev1_orth.unsqueeze(1).unsqueeze(0).unsqueeze(0))`
			`/ lambdah`
			`+ phase`
			`)`
			`leg2 = torch.cos(`
			`2`
			`* torch.pi`
			`* torch.matmul(xyg, ev2_orth.unsqueeze(1).unsqueeze(0).unsqueeze(0))`
			`/ lambdah`
			`+ phase`
			`)`
			`gabor_corner[towards_1y, towards_1x] *= leg1[towards_1y, towards_1x, 0, 0]`
			`gabor_corner[towards_2y, towards_2x] *= leg2[towards_2y, towards_2x, 0, 0]`

			`# depending on phase, Gabor might not be normalized...`
			`if normalize:`
			`s = gabor_corner.sum()`
			`s0 = gauss.sum()`
			`gabor_corner -= s / s0 * gauss`

			`return gabor_corner`


			`#`
			`# creates a filter bank of Gabor corners`
			`#`
			`# outputs:`
			`# filters: [n_source, n_change, HEIGHT, WIDTH]`
			`# dirs_source: [n_source]`
			`# dirs_change: [n_change]`
			`#`
			`def gaborner_filterbank(`
			`r_gab: int, # radius, size will be 2*r_gab+1`
			`n_source: int, # number of source orientations`
			`n_change: int, # number of direction changes`
			`lambdah: float, # wavelength of Gabor`
			`sigma: float, # half-width of Gabor`
			`phase: float, # phase of Gabor`
			`normalize: bool, # normalize patch to zero`
			`torch_device: str, # GPU or CPU...`
			`) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:`
			`kernels = torch.zeros(`
			`[n_source, n_change, 2 * r_gab + 1, 2 * r_gab + 1],`
			`device=torch_device,`
			`requires_grad=False,`
			`)`
			`dirs_source = 2 * torch.pi * torch.arange(n_source, device=torch_device) / n_source`
			`dirs_change = 2 * torch.pi * torch.arange(n_change, device=torch_device) / n_change`

			`for i_source in range(n_source):`
			`for i_change in range(n_change):`
			`gabor_corner = gaborner(`
			`r_gab=r_gab,`
			`dir_source=dirs_source[i_source],`
			`dir_change=dirs_change[i_change],`
			`lambdah=lambdah,`
			`sigma=sigma,`
			`phase=phase,`
			`normalize=normalize,`
			`torch_device=torch_device,`
			`)`
			`kernels[i_source, i_change] = gabor_corner`

			`# check = torch.isnan(gabor_corner).sum()`
			`# if check > 0:`
			`# print(i_source, i_change, check)`
			`# kernels[i_source, i_change] = 1`

			`return kernels, dirs_source, dirs_change`


			`def discretize_stimuli(`
			`posori,`
			`x_range: tuple,`
			`y_range: tuple,`
			`scale_factor: float,`
			`r_gab_PIX: int,`
			`n_source: int,`
			`n_change: int,`
			`torch_device: str,`
			`n_phase: int = 1,`
			`) -> torch.Tensor:`
			`# check correct input size`
			`s = posori.shape`
			`assert len(s) == 2, "posori should be NDARRAY with N x 1 entries"`
			`assert s[1] == 1, "posori should be NDARRAY with N x 1 entries"`

			`# determine size of (extended) canvas`
			`x_canvas_PIX = torch.tensor(`
			`(x_range[1] - x_range[0]) * scale_factor, device=torch_device`
			`).ceil()`
			`y_canvas_PIX = torch.tensor(`
			`(y_range[1] - y_range[0]) * scale_factor, device=torch_device`
			`).ceil()`
			`x_canvas_ext_PIX = int(x_canvas_PIX + 2 * r_gab_PIX)`
			`y_canvas_ext_PIX = int(y_canvas_PIX + 2 * r_gab_PIX)`

			`# get number of contours`
			`n_contours = s[0]`
			`index_phasrcchg = []`
			`index_y = []`
			`index_x = []`
			`for i_contour in range(n_contours):`
			`x_y_src_chg = torch.asarray(posori[i_contour, 0][1:, :].copy())`
			`x_y_src_chg[2] += torch.pi`

			`# if i_contour == 0:`
			`# print(x_y_src_chg[2][:3])`

			`# compute integer coordinates and find all visible elements`
			`x = ((x_y_src_chg[0] - x_range[0]) * scale_factor + r_gab_PIX).type(torch.long)`
			`y = y_canvas_ext_PIX - (`
			`(x_y_src_chg[1] - y_range[0]) * scale_factor + r_gab_PIX`
			`).type(torch.long)`
			`i_visible = torch.where(`
			`(x >= 0) * (y >= 0) * (x < x_canvas_ext_PIX) * (y < y_canvas_ext_PIX)`
			`)[0]`

			`# compute integer (changes of) directions`
			`i_source = (`
			`((((x_y_src_chg[2]) / (2 * torch.pi)) + 1 / (2 * n_source)) % 1) * n_source`
			`).type(torch.long)`
			`i_change = (`
			`(((x_y_src_chg[3] / (2 * torch.pi)) + 1 / (2 * n_change)) % 1) * n_change`
			`).type(torch.long)`

			`i_phase = torch.randint(n_phase, i_visible.size())`
			`index_phasrcchg.append(`
			`(i_phase * n_source + i_source[i_visible]) * n_change + i_change[i_visible]`
			`)`
			`# index_change.append(i_change[i_visible])`
			`index_y.append(y[i_visible])`
			`index_x.append(x[i_visible])`

			`return (`
			`index_phasrcchg,`
			`index_x,`
			`index_y,`
			`x_canvas_ext_PIX,`
			`y_canvas_ext_PIX,`
			`)`


			`def render_stimulus(`
			`kernels, index_element, index_y, index_x, y_canvas, x_canvas, torch_device`
			`):`
			`s = kernels.shape`
			`kx = s[-1]`
			`ky = s[-2]`

			`stimulus = torch.zeros((y_canvas + ky - 1, x_canvas + kx - 1), device=torch_device)`
			`n = index_element.size()[0]`
			`for i in torch.arange(n, device=torch_device):`
			`x = index_x[i]`
			`y = index_y[i]`
			`stimulus[y : y + ky, x : x + kx] += kernels[index_element[i]]`

			`return stimulus[ky - 1 : -(ky - 1), kx - 1 : -(kx - 1)]`


			`if __name__ == "__main__":`
			`VERBOSE = True`
			`BENCH_CONVOLVE = True`
			`BENCH_GPU = True`
			`BENCH_CPU = True`
			`BENCH_DAVID = True`

			`print("Testing contour rendering speed:")`
			`print("================================")`

			`# load contours, multiplex coordinates to simulate a larger set of contours`
			`n_multiplex = 1`
			`mat = scipy.io.loadmat("z.mat")`
			`posori = np.tile(mat["z"], (n_multiplex, 1))`
			`n_contours = posori.shape[0]`
			`print(f"Processing {n_contours} contour stimuli")`

			`# how many contours to render simultaneously?`
			`n_simultaneous = 5`
			`n_simultaneous_chunks, n_remaining = divmod(n_contours, n_simultaneous)`
			`assert n_remaining == 0, "Check parameters for simultaneous contour rendering!"`

			`# repeat some times for speed testing`
			`n_repeat = 10`
			`t_dis = torch.zeros((n_repeat + 2), device=torch_device)`
			`t_con = torch.zeros((n_repeat + 2), device=torch_device)`
			`t_rsg = torch.zeros((n_repeat + 2), device=torch_device)`
			`t_rsc = torch.zeros((n_repeat + 2), device="cpu")`
			`t_rsd = torch.zeros((n_repeat + 2), device="cpu")`

			`# cutout for stimuli, and gabor parameters`
			`x_range = [140, 940]`
			`y_range = [140, 940]`
			`d_gab = 40`
			`lambdah = 12`
			`sigma = 8`
			`phase = 0.0`
			`normalize = True`

			`# scale to convert coordinates to pixel values`
			`scale_factor = 0.25`

			`# number of directions for dictionary`
			`n_source = 32`
			`n_change = 32`

			`# convert sizes to pixel units`
			`lambdah_PIX = lambdah * scale_factor`
			`sigma_PIX = sigma * scale_factor`
			`r_gab_PIX = int(d_gab * scale_factor / 2)`
			`d_gab_PIX = r_gab_PIX * 2 + 1`

			`# make filterbank`
			`kernels, dirs_source, dirs_change = gaborner_filterbank(`
			`r_gab=r_gab_PIX,`
			`n_source=n_source,`
			`n_change=n_change,`
			`lambdah=lambdah_PIX,`
			`sigma=sigma_PIX,`
			`phase=phase,`
			`normalize=normalize,`
			`torch_device=torch_device,`
			`)`
			`kernels = kernels.reshape([1, n_source * n_change, d_gab_PIX, d_gab_PIX])`
			`kernels_flip = kernels.flip(dims=(-1, -2))`

			`# define "network" and put to cuda`
			`conv = torch.nn.Conv2d(`
			`in_channels=n_source * n_change,`
			`out_channels=1,`
			`kernel_size=d_gab_PIX,`
			`stride=1,`
			`device=torch_device,`
			`)`
			`conv.weight.data = kernels_flip`

			`print("Discretizing START!!!")`
			`t_dis[0] = time.perf_counter()`
			`for i_rep in range(n_repeat):`
			`# discretize`
			`(`
			`index_srcchg,`
			`index_x,`
			`index_y,`
			`x_canvas,`
			`y_canvas,`
			`) = discretize_stimuli(`
			`posori=posori,`
			`x_range=x_range,`
			`y_range=y_range,`
			`scale_factor=scale_factor,`
			`r_gab_PIX=r_gab_PIX,`
			`n_source=n_source,`
			`n_change=n_change,`
			`torch_device=torch_device,`
			`)`
			`t_dis[i_rep + 1] = time.perf_counter()`
			`t_dis[-1] = time.perf_counter()`
			`print("Discretizing END!!!")`

			`if BENCH_CONVOLVE:`
			`print("Allocating!")`
			`stimuli = torch.zeros(`
			`[n_simultaneous, n_source * n_change, y_canvas, x_canvas],`
			`device=torch_device,`
			`requires_grad=False,`
			`)`

			`print("Generation by CONVOLUTION start!")`
			`t_con[0] = time.perf_counter()`
			`for i_rep in torch.arange(n_repeat):`
			`for i_simultaneous_chunks in torch.arange(n_simultaneous_chunks):`
			`i_ofs = i_simultaneous_chunks * n_simultaneous`

			`for i_sim in torch.arange(n_simultaneous):`
			`stimuli[`
			`i_sim,`
			`index_srcchg[i_sim + i_ofs],`
			`index_y[i_sim + i_ofs],`
			`index_x[i_sim + i_ofs],`
			`] = 1`

			`output = conv(stimuli)`

			`for i_sim in range(n_simultaneous):`
			`stimuli[`
			`i_sim,`
			`index_srcchg[i_sim + i_ofs],`
			`index_y[i_sim + i_ofs],`
			`index_x[i_sim + i_ofs],`
			`] = 0`

			`t_con[i_rep + 1] = time.perf_counter()`
			`t_con[-1] = time.perf_counter()`
			`print("Generation by CONVOLUTION stop!")`

			`if BENCH_GPU:`
			`print("Generation by GPU start!")`
			`output_gpu = torch.zeros(`
			`(`
			`n_contours,`
			`y_canvas - d_gab_PIX + 1,`
			`x_canvas - d_gab_PIX + 1,`
			`),`
			`device=torch_device,`
			`)`
			`t_rsg[0] = time.perf_counter()`
			`for i_rep in torch.arange(n_repeat):`
			`for i_con in torch.arange(n_contours):`
			`output_gpu[i_con] = render_stimulus(`
			`kernels=kernels[0],`
			`index_element=index_srcchg[i_con],`
			`index_y=index_y[i_con],`
			`index_x=index_x[i_con],`
			`y_canvas=y_canvas,`
			`x_canvas=x_canvas,`
			`torch_device=torch_device,`
			`)`
			`# output_gpu = torch.clip(output_gpu, -1, +1)`

			`t_rsg[i_rep + 1] = time.perf_counter()`
			`t_rsg[-1] = time.perf_counter()`
			`print("Generation by GPU stop!")`

			`if BENCH_CPU:`
			`print("Generation by CPU start!")`
			`output_cpu = torch.zeros(`
			`(`
			`n_contours,`
			`y_canvas - d_gab_PIX + 1,`
			`x_canvas - d_gab_PIX + 1,`
			`),`
			`device="cpu",`
			`)`
			`kernels_cpu = kernels.detach().cpu()`
			`t_rsc[0] = time.perf_counter()`
			`for i_rep in range(n_repeat):`
			`for i_con in range(n_contours):`
			`output_cpu[i_con] = render_stimulus(`
			`kernels=kernels_cpu[0],`
			`index_element=index_srcchg[i_con],`
			`index_y=index_y[i_con],`
			`index_x=index_x[i_con],`
			`y_canvas=y_canvas,`
			`x_canvas=x_canvas,`
			`torch_device="cpu",`
			`)`
			`# output_cpu = torch.clip(output_cpu, -1, +1)`

			`t_rsc[i_rep + 1] = time.perf_counter()`
			`t_rsc[-1] = time.perf_counter()`
			`print("Generation by CPU stop!")`

			`if BENCH_DAVID:`
			`print("Generation by DAVID start!")`
			`from CPPExtensions.PyTCopyCPU import TCopyCPU as render_stimulus_CPP`

			`copyier = render_stimulus_CPP()`

			`number_of_cpu_processes = os.cpu_count()`
			`output_dav_tmp = torch.zeros(`
			`(`
			`n_contours,`
			`y_canvas + 2 * r_gab_PIX,`
			`x_canvas + 2 * r_gab_PIX,`
			`),`
			`device="cpu",`
			`dtype=torch.float,`
			`)`
			`gabor = kernels[0].detach().cpu()`

			`# Umsort!`
			`n_elements_total = 0`
			`for i_con in range(n_contours):`
			`n_elements_total += len(index_x[i_con])`
			`sparse_matrix = torch.zeros(`
			`(n_elements_total, 4), device="cpu", dtype=torch.int64`
			`)`
			`i_elements_total = 0`
			`for i_con in range(n_contours):`
			`n_add = len(index_x[i_con])`
			`sparse_matrix[i_elements_total : i_elements_total + n_add, 0] = i_con`
			`sparse_matrix[`
			`i_elements_total : i_elements_total + n_add, 1`
			`] = index_srcchg[i_con]`
			`sparse_matrix[i_elements_total : i_elements_total + n_add, 2] = index_y[`
			`i_con`
			`]`
			`sparse_matrix[i_elements_total : i_elements_total + n_add, 3] = index_x[`
			`i_con`
			`]`
			`i_elements_total += n_add`
			`assert i_elements_total == n_elements_total, "UNBEHAGEN macht sich breit!"`

			`t_dav = torch.zeros((n_repeat + 2), device="cpu")`
			`t_dav[0] = time.perf_counter()`
			`for i_rep in range(n_repeat):`
			`output_dav_tmp.fill_(0.0)`
			`copyier.process(`
			`sparse_matrix.data_ptr(),`
			`int(sparse_matrix.shape[0]),`
			`int(sparse_matrix.shape[1]),`
			`gabor.data_ptr(),`
			`int(gabor.shape[0]),`
			`int(gabor.shape[1]),`
			`int(gabor.shape[2]),`
			`output_dav_tmp.data_ptr(),`
			`int(output_dav_tmp.shape[0]),`
			`int(output_dav_tmp.shape[1]),`
			`int(output_dav_tmp.shape[2]),`
			`int(number_of_cpu_processes),`
			`)`
			`output_dav = output_dav_tmp[`
			`:,`
			`d_gab_PIX - 1 : -(d_gab_PIX - 1),`
			`d_gab_PIX - 1 : -(d_gab_PIX - 1),`
			`].clone()`
			`t_dav[i_rep + 1] = time.perf_counter()`
			`t_dav[-1] = time.perf_counter()`
			`print("Generation by DAVID done!")`

			`if VERBOSE: # show last stimulus`
			`if BENCH_CONVOLVE:`
			`plt.subplot(2, 2, 1)`
			`plt.imshow(output[-1, 0].detach().cpu(), cmap="gray", vmin=-1, vmax=+1)`
			`plt.title("convolve")`
			`if BENCH_GPU:`
			`plt.subplot(2, 2, 2)`
			`plt.imshow(output_gpu[-1].detach().cpu(), cmap="gray", vmin=-1, vmax=+1)`
			`plt.title("gpu")`
			`if BENCH_CPU:`
			`plt.subplot(2, 2, 3)`
			`plt.imshow(output_cpu[-1], cmap="gray", vmin=-1, vmax=+1)`
			`plt.title("cpu")`
			`if BENCH_DAVID:`
			`plt.subplot(2, 2, 4)`
			`plt.imshow(output_dav[-1], cmap="gray", vmin=-1, vmax=+1)`
			`plt.title("david")`
			`plt.show()`

			`dt_discretize = t_dis.diff() / n_contours`
			`plt.plot(dt_discretize.detach().cpu())`
			`dt_convolve = t_con.diff() / n_contours`
			`plt.plot(dt_convolve.detach().cpu())`
			`dt_gpu = t_rsg.diff() / n_contours`
			`plt.plot(dt_gpu.detach().cpu())`
			`dt_cpu = t_rsc.diff() / n_contours`
			`plt.plot(dt_cpu.detach().cpu())`
			`dt_david = t_dav.diff() / n_contours`
			`plt.plot(dt_david.detach().cpu())`

			`plt.legend(["discretize", "convolve", "gpu", "cpu", "david"])`
			`plt.show()`
			`print(`
			`f"Average discretize for 1k stims: {1000*dt_discretize[:-1].detach().cpu().mean()} secs."`
			`)`
			`print(`
			`f"Average convolve for 1k stims: {1000*dt_convolve[:-1].detach().cpu().mean()} secs."`
			`)`
			`print(f"Average gpu for 1k stims: {1000*dt_gpu[:-1].detach().cpu().mean()} secs.")`
			`print(f"Average cpu for 1k stims: {1000*dt_cpu[:-1].detach().cpu().mean()} secs.")`
			`print(`
			`f"Average david for 1k stims: {1000*dt_david[:-1].detach().cpu().mean()} secs."`
			`)`

			`if BENCH_GPU and BENCH_CPU and BENCH_DAVID:`
			`df1 = (torch.abs(output_gpu[-1].detach().cpu() - output_cpu[-1])).mean()`
			`df2 = (torch.abs(output_gpu[-1].detach().cpu() - output_dav[-1])).mean()`
			`df3 = (torch.abs(output_dav[-1].cpu() - output_cpu[-1])).mean()`
			`print(f"Differences: CPU-GPU:{df1}, GPU-David:{df2}, David-CPU:{df3}")`

			`# %%`