/*
 * Decompiled with CFR 0.152.
 */
package boofcv.alg.filter.convolve.noborder;

import boofcv.concurrency.BoofConcurrency;
import boofcv.misc.BoofMiscOps;
import boofcv.struct.convolve.Kernel1D_F32;
import boofcv.struct.convolve.Kernel1D_F64;
import boofcv.struct.convolve.Kernel1D_S32;
import boofcv.struct.convolve.Kernel2D_F32;
import boofcv.struct.convolve.Kernel2D_F64;
import boofcv.struct.convolve.Kernel2D_S32;
import boofcv.struct.image.GrayF32;
import boofcv.struct.image.GrayF64;
import boofcv.struct.image.GrayI16;
import boofcv.struct.image.GrayI8;
import boofcv.struct.image.GrayS16;
import boofcv.struct.image.GrayS32;
import boofcv.struct.image.GrayU16;
import boofcv.struct.image.GrayU8;
import java.util.Arrays;
import org.ddogleg.struct.DogArray_I32;
import org.jetbrains.annotations.Nullable;
import pabeles.concurrency.GrowArray;

public class ConvolveImageStandard_SB_MT {
    /**
     * Convolves {@code src} with a 1D kernel along the x-axis, skipping the image border.
     *
     * <p>In every row only the columns where the whole kernel overlaps the image are written:
     * {@code offset} up to (but excluding) {@code offset + width - kernelWidth + 1}. All other
     * pixels of {@code dst} are left as they were. One lambda call per row is dispatched through
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     */
    public static void horizontal(Kernel1D_F32 kernel, GrayF32 src, GrayF32 dst) {
        float[] dataSrc = src.data;
        float[] dataDst = dst.data;
        float[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                float total = 0.0f;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += dataSrc[indexSrc++] * dataKer[k];
                }
                dataDst[indexDst++] = total;
            }
        });
    }

    /**
     * Applies a 1D kernel down the columns of {@code src}; border rows of {@code dst} are not
     * written.
     *
     * <p>Output rows from {@code offset} up to (excluding)
     * {@code height - (kernelWidth - offset - 1)} are computed. Each of those rows is cleared
     * first and then built up one kernel tap at a time, so the inner loop walks one source row
     * and one destination row with consecutive indexes.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image; its width and height define the processed region
     */
    public static void vertical(Kernel1D_F32 kernel, GrayF32 src, GrayF32 dst) {
        final float[] input = src.data;
        final float[] output = dst.data;
        final float[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopFor(above, rowStop, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            // start from zero because the taps below accumulate into the output row
            Arrays.fill(output, outBase, outBase + cols, 0.0f);
            for (int tap = 0; tap < taps; ++tap) {
                final int inBase = src.startIndex + (row - above + tap) * src.stride;
                final float weight = weights[tap];
                for (int col = 0; col < cols; ++col) {
                    output[outBase + col] += input[inBase + col] * weight;
                }
            }
        });
    }

    /**
     * Convolves {@code src} with a square 2D kernel, writing only pixels where the kernel lies
     * fully inside the image.
     *
     * <p>Rows and columns from {@code kernel.offset} up to (excluding) the image size minus
     * {@code kernel.width - kernel.offset - 1} are written; the rest of {@code dest} is left
     * untouched. The kernel is read in row-major order.</p>
     *
     * @param kernel 2D kernel; {@code kernel.width} is used for both dimensions
     * @param src input image; only read
     * @param dest output image
     */
    public static void convolve(Kernel2D_F32 kernel, GrayF32 src, GrayF32 dest) {
        final float[] weights = kernel.data;
        final float[] input = src.data;
        final float[] output = dest.data;
        final int width = src.getWidth();
        final int height = src.getHeight();
        final int kw = kernel.width;
        final int left = kernel.offset;
        final int right = kw - left - 1;
        final int xStop = width - right;
        BoofConcurrency.loopFor(left, height - right, row -> {
            final int outRow = dest.startIndex + row * dest.stride;
            // source index of the kernel's first tap for output column 0 of this row
            final int inOrigin = src.startIndex + (row - left) * src.stride - left;
            for (int col = left; col < xStop; ++col) {
                float sum = 0.0f;
                for (int kr = 0; kr < kw; ++kr) {
                    final int inRow = inOrigin + kr * src.stride + col;
                    final int kerRow = kr * kw;
                    for (int kc = 0; kc < kw; ++kc) {
                        sum += input[inRow + kc] * weights[kerRow + kc];
                    }
                }
                output[outRow + col] = sum;
            }
        });
    }

    /**
     * Convolves {@code src} with a 1D kernel along the x-axis, skipping the image border.
     *
     * <p>In every row only the columns where the whole kernel overlaps the image are written:
     * {@code offset} up to (but excluding) {@code offset + width - kernelWidth + 1}. All other
     * pixels of {@code dst} are left as they were. One lambda call per row is dispatched through
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     */
    public static void horizontal(Kernel1D_F64 kernel, GrayF64 src, GrayF64 dst) {
        double[] dataSrc = src.data;
        double[] dataDst = dst.data;
        double[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                double total = 0.0;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += dataSrc[indexSrc++] * dataKer[k];
                }
                dataDst[indexDst++] = total;
            }
        });
    }

    /**
     * Applies a 1D kernel down the columns of {@code src}; border rows of {@code dst} are not
     * written.
     *
     * <p>Output rows from {@code offset} up to (excluding)
     * {@code height - (kernelWidth - offset - 1)} are computed. Each of those rows is cleared
     * first and then built up one kernel tap at a time, so the inner loop walks one source row
     * and one destination row with consecutive indexes.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image; its width and height define the processed region
     */
    public static void vertical(Kernel1D_F64 kernel, GrayF64 src, GrayF64 dst) {
        final double[] input = src.data;
        final double[] output = dst.data;
        final double[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopFor(above, rowStop, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            // start from zero because the taps below accumulate into the output row
            Arrays.fill(output, outBase, outBase + cols, 0.0);
            for (int tap = 0; tap < taps; ++tap) {
                final int inBase = src.startIndex + (row - above + tap) * src.stride;
                final double weight = weights[tap];
                for (int col = 0; col < cols; ++col) {
                    output[outBase + col] += input[inBase + col] * weight;
                }
            }
        });
    }

    /**
     * Convolves {@code src} with a square 2D kernel, writing only pixels where the kernel lies
     * fully inside the image.
     *
     * <p>Rows and columns from {@code kernel.offset} up to (excluding) the image size minus
     * {@code kernel.width - kernel.offset - 1} are written; the rest of {@code dest} is left
     * untouched. The kernel is read in row-major order.</p>
     *
     * @param kernel 2D kernel; {@code kernel.width} is used for both dimensions
     * @param src input image; only read
     * @param dest output image
     */
    public static void convolve(Kernel2D_F64 kernel, GrayF64 src, GrayF64 dest) {
        final double[] weights = kernel.data;
        final double[] input = src.data;
        final double[] output = dest.data;
        final int width = src.getWidth();
        final int height = src.getHeight();
        final int kw = kernel.width;
        final int left = kernel.offset;
        final int right = kw - left - 1;
        final int xStop = width - right;
        BoofConcurrency.loopFor(left, height - right, row -> {
            final int outRow = dest.startIndex + row * dest.stride;
            // source index of the kernel's first tap for output column 0 of this row
            final int inOrigin = src.startIndex + (row - left) * src.stride - left;
            for (int col = left; col < xStop; ++col) {
                double sum = 0.0;
                for (int kr = 0; kr < kw; ++kr) {
                    final int inRow = inOrigin + kr * src.stride + col;
                    final int kerRow = kr * kw;
                    for (int kc = 0; kc < kw; ++kc) {
                        sum += input[inRow + kc] * weights[kerRow + kc];
                    }
                }
                output[outRow + col] = sum;
            }
        });
    }

    /**
     * Convolves an unsigned 8-bit image with an integer 1D kernel along the x-axis, skipping the
     * image border.
     *
     * <p>Source bytes are read as unsigned values ({@code & 0xFF}), summed in an {@code int} and
     * narrowed to {@code short} on store. In every row only columns {@code offset} up to
     * (excluding) {@code offset + width - kernelWidth + 1} are written; other pixels of
     * {@code dst} are left as they were.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayU8 src, GrayI16 dst) {
        byte[] dataSrc = src.data;
        short[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += (dataSrc[indexSrc++] & 0xFF) * dataKer[k];
                }
                dataDst[indexDst++] = (short)total;
            }
        });
    }

    /**
     * Applies an integer 1D kernel down the columns of an unsigned 8-bit image; border rows of
     * {@code dst} are not written.
     *
     * <p>Output rows from {@code offset} up to (excluding)
     * {@code height - (kernelWidth - offset - 1)} are computed. Each row is cleared and then
     * accumulated tap by tap directly in the {@code short} output, so every partial product and
     * every running sum is narrowed to 16 bits.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; bytes are read as unsigned via {@code & 0xFF}
     * @param dst output image; its width and height define the processed region
     */
    public static void vertical(Kernel1D_S32 kernel, GrayU8 src, GrayI16 dst) {
        final byte[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopFor(above, rowStop, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            // start from zero because the taps below accumulate into the output row
            Arrays.fill(output, outBase, outBase + cols, (short)0);
            for (int tap = 0; tap < taps; ++tap) {
                final int inBase = src.startIndex + (row - above + tap) * src.stride;
                final int weight = weights[tap];
                for (int col = 0; col < cols; ++col) {
                    // compound assignment narrows the sum back to short implicitly
                    output[outBase + col] += (short)((input[inBase + col] & 0xFF) * weight);
                }
            }
        });
    }

    /**
     * Convolves an unsigned 8-bit image with a square integer 2D kernel, writing only pixels
     * where the kernel lies fully inside the image.
     *
     * <p>Source bytes are read as unsigned ({@code & 0xFF}); each sum is kept in an {@code int}
     * and narrowed to {@code short} on store. Rows and columns from {@code kernel.offset} up to
     * (excluding) the image size minus {@code kernel.width - kernel.offset - 1} are written.</p>
     *
     * @param kernel 2D kernel read in row-major order; {@code kernel.width} is used for both
     *        dimensions
     * @param src input image; only read
     * @param dest output image
     */
    public static void convolve(Kernel2D_S32 kernel, GrayU8 src, GrayI16 dest) {
        final int[] weights = kernel.data;
        final byte[] input = src.data;
        final short[] output = dest.data;
        final int width = src.getWidth();
        final int height = src.getHeight();
        final int kw = kernel.width;
        final int left = kernel.offset;
        final int right = kw - left - 1;
        final int xStop = width - right;
        BoofConcurrency.loopFor(left, height - right, row -> {
            final int outRow = dest.startIndex + row * dest.stride;
            // source index of the kernel's first tap for output column 0 of this row
            final int inOrigin = src.startIndex + (row - left) * src.stride - left;
            for (int col = left; col < xStop; ++col) {
                int sum = 0;
                for (int kr = 0; kr < kw; ++kr) {
                    final int inRow = inOrigin + kr * src.stride + col;
                    final int kerRow = kr * kw;
                    for (int kc = 0; kc < kw; ++kc) {
                        sum += (input[inRow + kc] & 0xFF) * weights[kerRow + kc];
                    }
                }
                output[outRow + col] = (short)sum;
            }
        });
    }

    /**
     * Convolves an unsigned 8-bit image with an integer 1D kernel along the x-axis into a 32-bit
     * image, skipping the image border.
     *
     * <p>Source bytes are read as unsigned values ({@code & 0xFF}) and summed in an {@code int}.
     * In every row only columns {@code offset} up to (excluding)
     * {@code offset + width - kernelWidth + 1} are written; other pixels of {@code dst} are left
     * as they were.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayU8 src, GrayS32 dst) {
        byte[] dataSrc = src.data;
        int[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += (dataSrc[indexSrc++] & 0xFF) * dataKer[k];
                }
                dataDst[indexDst++] = total;
            }
        });
    }

    /**
     * Applies an integer 1D kernel down the columns of an unsigned 8-bit image into a 32-bit
     * image; border rows of {@code dst} are not written.
     *
     * <p>Output rows from {@code offset} up to (excluding)
     * {@code height - (kernelWidth - offset - 1)} are computed. Each row is cleared and then
     * accumulated one kernel tap at a time.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; bytes are read as unsigned via {@code & 0xFF}
     * @param dst output image; its width and height define the processed region
     */
    public static void vertical(Kernel1D_S32 kernel, GrayU8 src, GrayS32 dst) {
        final byte[] input = src.data;
        final int[] output = dst.data;
        final int[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopFor(above, rowStop, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            // start from zero because the taps below accumulate into the output row
            Arrays.fill(output, outBase, outBase + cols, 0);
            for (int tap = 0; tap < taps; ++tap) {
                final int inBase = src.startIndex + (row - above + tap) * src.stride;
                final int weight = weights[tap];
                for (int col = 0; col < cols; ++col) {
                    output[outBase + col] += (input[inBase + col] & 0xFF) * weight;
                }
            }
        });
    }

    /**
     * Convolves an unsigned 8-bit image with a square integer 2D kernel into a 32-bit image,
     * writing only pixels where the kernel lies fully inside the image.
     *
     * <p>Source bytes are read as unsigned ({@code & 0xFF}). Rows and columns from
     * {@code kernel.offset} up to (excluding) the image size minus
     * {@code kernel.width - kernel.offset - 1} are written.</p>
     *
     * @param kernel 2D kernel read in row-major order; {@code kernel.width} is used for both
     *        dimensions
     * @param src input image; only read
     * @param dest output image
     */
    public static void convolve(Kernel2D_S32 kernel, GrayU8 src, GrayS32 dest) {
        final int[] weights = kernel.data;
        final byte[] input = src.data;
        final int[] output = dest.data;
        final int width = src.getWidth();
        final int height = src.getHeight();
        final int kw = kernel.width;
        final int left = kernel.offset;
        final int right = kw - left - 1;
        final int xStop = width - right;
        BoofConcurrency.loopFor(left, height - right, row -> {
            final int outRow = dest.startIndex + row * dest.stride;
            // source index of the kernel's first tap for output column 0 of this row
            final int inOrigin = src.startIndex + (row - left) * src.stride - left;
            for (int col = left; col < xStop; ++col) {
                int sum = 0;
                for (int kr = 0; kr < kw; ++kr) {
                    final int inRow = inOrigin + kr * src.stride + col;
                    final int kerRow = kr * kw;
                    for (int kc = 0; kc < kw; ++kc) {
                        sum += (input[inRow + kc] & 0xFF) * weights[kerRow + kc];
                    }
                }
                output[outRow + col] = sum;
            }
        });
    }

    /**
     * Applies an integer 1D kernel down the columns of an unsigned 16-bit image, divides by
     * {@code divisor} and stores the result as bytes; border rows of {@code dst} are not written.
     *
     * <p>Each output row is accumulated in an {@code int} scratch row, then
     * {@code (sum + divisor/2)} is multiplied by {@code 1.0/divisor} in double precision and
     * truncated by the cast to {@code byte}. The scratch row is reset to zero after every row.</p>
     *
     * <p>NOTE(review): multiplying by the reciprocal is not guaranteed to match integer division
     * for every input (for example {@code 49 * (1.0/49)} is below 1 in double arithmetic);
     * confirm this is acceptable.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; shorts are read as unsigned via {@code & 0xFFFF}
     * @param dst output image; its width and height define the processed region
     * @param divisor value the accumulated sum is divided by
     * @param workspaces optional scratch storage; passed through
     *        {@code BoofMiscOps.checkDeclare}, which is expected to supply one when this is null
     */
    public static void vertical(Kernel1D_S32 kernel, GrayU16 src, GrayI8 dst, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        final GrowArray<DogArray_I32> pool = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final short[] input = src.data;
        final byte[] output = dst.data;
        final int[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int halfDivisor = divisor / 2;
        final double reciprocal = 1.0 / (double)divisor;
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopBlocks(above, rowStop, pool, (work, rowBegin, rowEnd) -> {
            // presumably returned zeroed (third argument true) — the loop below relies on that
            final int[] sums = BoofMiscOps.checkDeclare(work, cols, true);
            for (int row = rowBegin; row < rowEnd; ++row) {
                for (int tap = 0; tap < taps; ++tap) {
                    final int weight = weights[tap];
                    final int inBase = src.startIndex + (row - above + tap) * src.stride;
                    for (int col = 0; col < cols; ++col) {
                        sums[col] += (input[inBase + col] & 0xFFFF) * weight;
                    }
                }
                final int outBase = dst.startIndex + row * dst.stride;
                for (int col = 0; col < cols; ++col) {
                    output[outBase + col] = (byte)((sums[col] + halfDivisor) * reciprocal);
                }
                // clear the accumulator for the next row of this block
                Arrays.fill(sums, 0, cols, 0);
            }
        });
    }

    /**
     * Convolves a signed 16-bit image with an integer 1D kernel along the x-axis, skipping the
     * image border.
     *
     * <p>Each sum is accumulated in an {@code int} and narrowed to {@code short} on store. In
     * every row only columns {@code offset} up to (excluding)
     * {@code offset + width - kernelWidth + 1} are written; other pixels of {@code dst} are left
     * as they were.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayS16 src, GrayI16 dst) {
        short[] dataSrc = src.data;
        short[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += dataSrc[indexSrc++] * dataKer[k];
                }
                dataDst[indexDst++] = (short)total;
            }
        });
    }

    /**
     * Applies an integer 1D kernel down the columns of a signed 16-bit image; border rows of
     * {@code dst} are not written.
     *
     * <p>Output rows from {@code offset} up to (excluding)
     * {@code height - (kernelWidth - offset - 1)} are computed. Each row is cleared and then
     * accumulated tap by tap directly in the {@code short} output, so every partial product and
     * every running sum is narrowed to 16 bits.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image; its width and height define the processed region
     */
    public static void vertical(Kernel1D_S32 kernel, GrayS16 src, GrayI16 dst) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopFor(above, rowStop, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            // start from zero because the taps below accumulate into the output row
            Arrays.fill(output, outBase, outBase + cols, (short)0);
            for (int tap = 0; tap < taps; ++tap) {
                final int inBase = src.startIndex + (row - above + tap) * src.stride;
                final int weight = weights[tap];
                for (int col = 0; col < cols; ++col) {
                    // compound assignment narrows the sum back to short implicitly
                    output[outBase + col] += (short)(input[inBase + col] * weight);
                }
            }
        });
    }

    /**
     * Convolves a signed 16-bit image with a square integer 2D kernel, writing only pixels where
     * the kernel lies fully inside the image.
     *
     * <p>Each sum is kept in an {@code int} and narrowed to {@code short} on store. Rows and
     * columns from {@code kernel.offset} up to (excluding) the image size minus
     * {@code kernel.width - kernel.offset - 1} are written.</p>
     *
     * @param kernel 2D kernel read in row-major order; {@code kernel.width} is used for both
     *        dimensions
     * @param src input image; only read
     * @param dest output image
     */
    public static void convolve(Kernel2D_S32 kernel, GrayS16 src, GrayI16 dest) {
        final int[] weights = kernel.data;
        final short[] input = src.data;
        final short[] output = dest.data;
        final int width = src.getWidth();
        final int height = src.getHeight();
        final int kw = kernel.width;
        final int left = kernel.offset;
        final int right = kw - left - 1;
        final int xStop = width - right;
        BoofConcurrency.loopFor(left, height - right, row -> {
            final int outRow = dest.startIndex + row * dest.stride;
            // source index of the kernel's first tap for output column 0 of this row
            final int inOrigin = src.startIndex + (row - left) * src.stride - left;
            for (int col = left; col < xStop; ++col) {
                int sum = 0;
                for (int kr = 0; kr < kw; ++kr) {
                    final int inRow = inOrigin + kr * src.stride + col;
                    final int kerRow = kr * kw;
                    for (int kc = 0; kc < kw; ++kc) {
                        sum += input[inRow + kc] * weights[kerRow + kc];
                    }
                }
                output[outRow + col] = (short)sum;
            }
        });
    }

    /**
     * Convolves an unsigned 8-bit image with an integer 1D kernel along the x-axis, divides by
     * {@code divisor} and stores bytes, skipping the image border.
     *
     * <p>Source bytes are read as unsigned ({@code & 0xFF}). Half the divisor is added before the
     * integer division, which rounds non-negative sums to the nearest integer. In every row only
     * columns {@code offset} up to (excluding) {@code offset + width - kernelWidth + 1} are
     * written; other pixels of {@code dst} are left as they were.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     * @param divisor value each sum is divided by; zero causes an {@code ArithmeticException}
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayU8 src, GrayI8 dst, int divisor) {
        byte[] dataSrc = src.data;
        byte[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int halfDivisor = divisor / 2;
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += (dataSrc[indexSrc++] & 0xFF) * dataKer[k];
                }
                dataDst[indexDst++] = (byte)((total + halfDivisor) / divisor);
            }
        });
    }

    /**
     * Applies an integer 1D kernel down the columns of an unsigned 8-bit image, divides by
     * {@code divisor} and stores the result as bytes; border rows of {@code dst} are not written.
     *
     * <p>Each output row is accumulated in an {@code int} scratch row, then
     * {@code (sum + divisor/2)} is multiplied by {@code 1.0/divisor} in double precision and
     * truncated by the cast to {@code byte}. The scratch row is reset to zero after every row.</p>
     *
     * <p>NOTE(review): multiplying by the reciprocal is not guaranteed to match integer division
     * for every input (for example {@code 49 * (1.0/49)} is below 1 in double arithmetic);
     * confirm this is acceptable.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; bytes are read as unsigned via {@code & 0xFF}
     * @param dst output image; its width and height define the processed region
     * @param divisor value the accumulated sum is divided by
     * @param workspaces optional scratch storage; passed through
     *        {@code BoofMiscOps.checkDeclare}, which is expected to supply one when this is null
     */
    public static void vertical(Kernel1D_S32 kernel, GrayU8 src, GrayI8 dst, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        final GrowArray<DogArray_I32> pool = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final byte[] input = src.data;
        final byte[] output = dst.data;
        final int[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int halfDivisor = divisor / 2;
        final double reciprocal = 1.0 / (double)divisor;
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopBlocks(above, rowStop, pool, (work, rowBegin, rowEnd) -> {
            // presumably returned zeroed (third argument true) — the loop below relies on that
            final int[] sums = BoofMiscOps.checkDeclare(work, cols, true);
            for (int row = rowBegin; row < rowEnd; ++row) {
                for (int tap = 0; tap < taps; ++tap) {
                    final int weight = weights[tap];
                    final int inBase = src.startIndex + (row - above + tap) * src.stride;
                    for (int col = 0; col < cols; ++col) {
                        sums[col] += (input[inBase + col] & 0xFF) * weight;
                    }
                }
                final int outBase = dst.startIndex + row * dst.stride;
                for (int col = 0; col < cols; ++col) {
                    output[outBase + col] = (byte)((sums[col] + halfDivisor) * reciprocal);
                }
                // clear the accumulator for the next row of this block
                Arrays.fill(sums, 0, cols, 0);
            }
        });
    }

    /**
     * Convolves an unsigned 8-bit image with a square integer 2D kernel, divides by
     * {@code divisor} and stores bytes; only pixels where the kernel lies fully inside the image
     * are written.
     *
     * <p>For each output row the contribution of every kernel row is summed into an {@code int}
     * scratch row indexed by output column. Half the divisor is added before the integer
     * division, which rounds non-negative sums to the nearest integer.</p>
     *
     * @param kernel 2D kernel read in row-major order; {@code kernel.width} is used for both
     *        dimensions
     * @param src input image; bytes are read as unsigned via {@code & 0xFF}
     * @param dest output image
     * @param divisor value each sum is divided by; zero causes an {@code ArithmeticException}
     * @param workspaces optional scratch storage; passed through
     *        {@code BoofMiscOps.checkDeclare}, which is expected to supply one when this is null
     */
    public static void convolve(Kernel2D_S32 kernel, GrayU8 src, GrayI8 dest, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        final GrowArray<DogArray_I32> pool = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final int[] weights = kernel.data;
        final byte[] input = src.data;
        final byte[] output = dest.data;
        final int width = src.getWidth();
        final int height = src.getHeight();
        final int halfDivisor = divisor / 2;
        final int left = kernel.offset;
        final int right = kernel.width - kernel.offset - 1;
        // third argument is presumably the minimum block size — verify against BoofConcurrency
        BoofConcurrency.loopBlocks(left, height - right, kernel.width, pool, (work, rowBegin, rowEnd) -> {
            final int[] rowSums = BoofMiscOps.checkDeclare(work, src.width, false);
            final int kw = kernel.width;
            final int xStop = width - right;
            for (int row = rowBegin; row < rowEnd; ++row) {
                int kr = 0;
                // do/while: kernel row 0 is always processed and overwrites whatever the scratch
                // row held before; the remaining kernel rows add to it
                do {
                    final int inRow = src.startIndex + (row + kr - left) * src.stride - left;
                    final int kerRow = kr * kw;
                    for (int col = left; col < xStop; ++col) {
                        int partial = 0;
                        for (int kc = 0; kc < kw; ++kc) {
                            partial += (input[inRow + col + kc] & 0xFF) * weights[kerRow + kc];
                        }
                        if (kr == 0) {
                            rowSums[col] = partial;
                        } else {
                            rowSums[col] += partial;
                        }
                    }
                } while (++kr < kw);
                final int outRow = dest.startIndex + row * dest.stride;
                for (int col = left; col < xStop; ++col) {
                    output[outRow + col] = (byte)((rowSums[col] + halfDivisor) / divisor);
                }
            }
        });
    }

    /**
     * Convolves a signed 16-bit image with an integer 1D kernel along the x-axis, divides by
     * {@code divisor} and stores shorts, skipping the image border.
     *
     * <p>Half the divisor is added before the integer division, which rounds non-negative sums
     * to the nearest integer. In every row only columns {@code offset} up to (excluding)
     * {@code offset + width - kernelWidth + 1} are written; other pixels of {@code dst} are left
     * as they were.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     * @param divisor value each sum is divided by; zero causes an {@code ArithmeticException}
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayS16 src, GrayI16 dst, int divisor) {
        short[] dataSrc = src.data;
        short[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int halfDivisor = divisor / 2;
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += dataSrc[indexSrc++] * dataKer[k];
                }
                dataDst[indexDst++] = (short)((total + halfDivisor) / divisor);
            }
        });
    }

    /**
     * Applies an integer 1D kernel down the columns of a signed 16-bit image, divides by
     * {@code divisor} and stores shorts; border rows of {@code dst} are not written.
     *
     * <p>Each output row is accumulated in an {@code int} scratch row, then
     * {@code (sum + divisor/2)} is multiplied by {@code 1.0/divisor} in double precision and
     * truncated by the cast to {@code short}. The scratch row is reset to zero after every
     * row.</p>
     *
     * <p>NOTE(review): multiplying by the reciprocal is not guaranteed to match integer division
     * for every input (for example {@code 49 * (1.0/49)} is below 1 in double arithmetic);
     * confirm this is acceptable.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image; its width and height define the processed region
     * @param divisor value the accumulated sum is divided by
     * @param workspaces optional scratch storage; passed through
     *        {@code BoofMiscOps.checkDeclare}, which is expected to supply one when this is null
     */
    public static void vertical(Kernel1D_S32 kernel, GrayS16 src, GrayI16 dst, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        final GrowArray<DogArray_I32> pool = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int halfDivisor = divisor / 2;
        final double reciprocal = 1.0 / (double)divisor;
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopBlocks(above, rowStop, pool, (work, rowBegin, rowEnd) -> {
            // presumably returned zeroed (third argument true) — the loop below relies on that
            final int[] sums = BoofMiscOps.checkDeclare(work, cols, true);
            for (int row = rowBegin; row < rowEnd; ++row) {
                for (int tap = 0; tap < taps; ++tap) {
                    final int weight = weights[tap];
                    final int inBase = src.startIndex + (row - above + tap) * src.stride;
                    for (int col = 0; col < cols; ++col) {
                        sums[col] += input[inBase + col] * weight;
                    }
                }
                final int outBase = dst.startIndex + row * dst.stride;
                for (int col = 0; col < cols; ++col) {
                    output[outBase + col] = (short)((sums[col] + halfDivisor) * reciprocal);
                }
                // clear the accumulator for the next row of this block
                Arrays.fill(sums, 0, cols, 0);
            }
        });
    }

    /**
     * Convolves a signed 16-bit image with a square integer 2D kernel, divides by
     * {@code divisor} and stores shorts; only pixels where the kernel lies fully inside the
     * image are written.
     *
     * <p>For each output row the contribution of every kernel row is summed into an {@code int}
     * scratch row indexed by output column. Half the divisor is added before the integer
     * division, which rounds non-negative sums to the nearest integer.</p>
     *
     * @param kernel 2D kernel read in row-major order; {@code kernel.width} is used for both
     *        dimensions
     * @param src input image; only read
     * @param dest output image
     * @param divisor value each sum is divided by; zero causes an {@code ArithmeticException}
     * @param workspaces optional scratch storage; passed through
     *        {@code BoofMiscOps.checkDeclare}, which is expected to supply one when this is null
     */
    public static void convolve(Kernel2D_S32 kernel, GrayS16 src, GrayI16 dest, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        final GrowArray<DogArray_I32> pool = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final int[] weights = kernel.data;
        final short[] input = src.data;
        final short[] output = dest.data;
        final int width = src.getWidth();
        final int height = src.getHeight();
        final int halfDivisor = divisor / 2;
        final int left = kernel.offset;
        final int right = kernel.width - kernel.offset - 1;
        // third argument is presumably the minimum block size — verify against BoofConcurrency
        BoofConcurrency.loopBlocks(left, height - right, kernel.width, pool, (work, rowBegin, rowEnd) -> {
            final int[] rowSums = BoofMiscOps.checkDeclare(work, src.width, false);
            final int kw = kernel.width;
            final int xStop = width - right;
            for (int row = rowBegin; row < rowEnd; ++row) {
                int kr = 0;
                // do/while: kernel row 0 is always processed and overwrites whatever the scratch
                // row held before; the remaining kernel rows add to it
                do {
                    final int inRow = src.startIndex + (row + kr - left) * src.stride - left;
                    final int kerRow = kr * kw;
                    for (int col = left; col < xStop; ++col) {
                        int partial = 0;
                        for (int kc = 0; kc < kw; ++kc) {
                            partial += input[inRow + col + kc] * weights[kerRow + kc];
                        }
                        if (kr == 0) {
                            rowSums[col] = partial;
                        } else {
                            rowSums[col] += partial;
                        }
                    }
                } while (++kr < kw);
                final int outRow = dest.startIndex + row * dest.stride;
                for (int col = left; col < xStop; ++col) {
                    output[outRow + col] = (short)((rowSums[col] + halfDivisor) / divisor);
                }
            }
        });
    }

    /**
     * Convolves an unsigned 16-bit image with an integer 1D kernel along the x-axis, skipping
     * the image border.
     *
     * <p>Source shorts are read as unsigned values ({@code & 0xFFFF}), summed in an {@code int}
     * and narrowed to {@code short} on store. In every row only columns {@code offset} up to
     * (excluding) {@code offset + width - kernelWidth + 1} are written; other pixels of
     * {@code dst} are left as they were.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayU16 src, GrayI16 dst) {
        short[] dataSrc = src.data;
        short[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += (dataSrc[indexSrc++] & 0xFFFF) * dataKer[k];
                }
                dataDst[indexDst++] = (short)total;
            }
        });
    }

    /**
     * Applies an integer 1D kernel down the columns of an unsigned 16-bit image; border rows of
     * {@code dst} are not written.
     *
     * <p>Output rows from {@code offset} up to (excluding)
     * {@code height - (kernelWidth - offset - 1)} are computed. Each row is cleared and then
     * accumulated tap by tap directly in the {@code short} output, so every partial product and
     * every running sum is narrowed to 16 bits.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; shorts are read as unsigned via {@code & 0xFFFF}
     * @param dst output image; its width and height define the processed region
     */
    public static void vertical(Kernel1D_S32 kernel, GrayU16 src, GrayI16 dst) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopFor(above, rowStop, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            // start from zero because the taps below accumulate into the output row
            Arrays.fill(output, outBase, outBase + cols, (short)0);
            for (int tap = 0; tap < taps; ++tap) {
                final int inBase = src.startIndex + (row - above + tap) * src.stride;
                final int weight = weights[tap];
                for (int col = 0; col < cols; ++col) {
                    // compound assignment narrows the sum back to short implicitly
                    output[outBase + col] += (short)((input[inBase + col] & 0xFFFF) * weight);
                }
            }
        });
    }

    /**
     * Convolves an unsigned 16-bit image with a square integer 2D kernel, writing only pixels
     * where the kernel lies fully inside the image.
     *
     * <p>Source shorts are read as unsigned ({@code & 0xFFFF}); each sum is kept in an
     * {@code int} and narrowed to {@code short} on store. Rows and columns from
     * {@code kernel.offset} up to (excluding) the image size minus
     * {@code kernel.width - kernel.offset - 1} are written.</p>
     *
     * @param kernel 2D kernel read in row-major order; {@code kernel.width} is used for both
     *        dimensions
     * @param src input image; only read
     * @param dest output image
     */
    public static void convolve(Kernel2D_S32 kernel, GrayU16 src, GrayI16 dest) {
        final int[] weights = kernel.data;
        final short[] input = src.data;
        final short[] output = dest.data;
        final int width = src.getWidth();
        final int height = src.getHeight();
        final int kw = kernel.width;
        final int left = kernel.offset;
        final int right = kw - left - 1;
        final int xStop = width - right;
        BoofConcurrency.loopFor(left, height - right, row -> {
            final int outRow = dest.startIndex + row * dest.stride;
            // source index of the kernel's first tap for output column 0 of this row
            final int inOrigin = src.startIndex + (row - left) * src.stride - left;
            for (int col = left; col < xStop; ++col) {
                int sum = 0;
                for (int kr = 0; kr < kw; ++kr) {
                    final int inRow = inOrigin + kr * src.stride + col;
                    final int kerRow = kr * kw;
                    for (int kc = 0; kc < kw; ++kc) {
                        sum += (input[inRow + kc] & 0xFFFF) * weights[kerRow + kc];
                    }
                }
                output[outRow + col] = (short)sum;
            }
        });
    }

    /**
     * Convolves an unsigned 16-bit image with an integer 1D kernel along the x-axis, divides by
     * {@code divisor} and stores shorts, skipping the image border.
     *
     * <p>Source shorts are read as unsigned ({@code & 0xFFFF}). Half the divisor is added before
     * the integer division, which rounds non-negative sums to the nearest integer. In every row
     * only columns {@code offset} up to (excluding) {@code offset + width - kernelWidth + 1} are
     * written; other pixels of {@code dst} are left as they were.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; only read
     * @param dst output image, addressed with the same row/column coordinates as {@code src}
     * @param divisor value each sum is divided by; zero causes an {@code ArithmeticException}
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayU16 src, GrayI16 dst, int divisor) {
        short[] dataSrc = src.data;
        short[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int halfDivisor = divisor / 2;
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // Initialise the row cursor before deriving jEnd from it; the previous code read j
            // while it was still unassigned, which violates Java's definite-assignment rule.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                // j is the source index under the first kernel tap
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += (dataSrc[indexSrc++] & 0xFFFF) * dataKer[k];
                }
                dataDst[indexDst++] = (short)((total + halfDivisor) / divisor);
            }
        });
    }

    /**
     * Applies an integer 1D kernel down the columns of an unsigned 16-bit image, divides by
     * {@code divisor} and stores shorts; border rows of {@code dst} are not written.
     *
     * <p>Each output row is accumulated in an {@code int} scratch row, then
     * {@code (sum + divisor/2)} is multiplied by {@code 1.0/divisor} in double precision and
     * truncated by the cast to {@code short}. The scratch row is reset to zero after every
     * row.</p>
     *
     * <p>NOTE(review): multiplying by the reciprocal is not guaranteed to match integer division
     * for every input (for example {@code 49 * (1.0/49)} is below 1 in double arithmetic);
     * confirm this is acceptable.</p>
     *
     * @param kernel kernel whose data, width and offset define the filter
     * @param src input image; shorts are read as unsigned via {@code & 0xFFFF}
     * @param dst output image; its width and height define the processed region
     * @param divisor value the accumulated sum is divided by
     * @param workspaces optional scratch storage; passed through
     *        {@code BoofMiscOps.checkDeclare}, which is expected to supply one when this is null
     */
    public static void vertical(Kernel1D_S32 kernel, GrayU16 src, GrayI16 dst, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        final GrowArray<DogArray_I32> pool = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int above = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int halfDivisor = divisor / 2;
        final double reciprocal = 1.0 / (double)divisor;
        final int cols = dst.getWidth();
        final int rowStop = dst.getHeight() - (taps - above - 1);
        BoofConcurrency.loopBlocks(above, rowStop, pool, (work, rowBegin, rowEnd) -> {
            // presumably returned zeroed (third argument true) — the loop below relies on that
            final int[] sums = BoofMiscOps.checkDeclare(work, cols, true);
            for (int row = rowBegin; row < rowEnd; ++row) {
                for (int tap = 0; tap < taps; ++tap) {
                    final int weight = weights[tap];
                    final int inBase = src.startIndex + (row - above + tap) * src.stride;
                    for (int col = 0; col < cols; ++col) {
                        sums[col] += (input[inBase + col] & 0xFFFF) * weight;
                    }
                }
                final int outBase = dst.startIndex + row * dst.stride;
                for (int col = 0; col < cols; ++col) {
                    output[outBase + col] = (short)((sums[col] + halfDivisor) * reciprocal);
                }
                // clear the accumulator for the next row of this block
                Arrays.fill(sums, 0, cols, 0);
            }
        });
    }

    /**
     * Convolves a 2D integer kernel over {@code src} and stores the normalised result
     * in {@code dest}.
     *
     * <p>The kernel is indexed as {@code kernel.width} rows of {@code kernel.width}
     * weights. Source samples are masked with {@code 0xFFFF} (read as unsigned
     * 16-bit). Each sum has {@code divisor / 2} added before integer division by
     * {@code divisor}, then is narrowed to {@code short}.
     *
     * <p>Only pixels with {@code kernel.offset <= x < width - offsetR} and
     * {@code kernel.offset <= y < height - offsetR}, where
     * {@code offsetR = kernel.width - kernel.offset - 1}, are written.
     *
     * @param kernel     2D kernel supplying weights, offset and width
     * @param src        input image
     * @param dest       output image; written in place
     * @param divisor    value the weighted sum is divided by
     * @param workspaces per-block scratch storage; may be {@code null}, in which case
     *                   it is obtained through {@code BoofMiscOps.checkDeclare}
     */
    public static void convolve(Kernel2D_S32 kernel, GrayU16 src, GrayI16 dest, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final int[] weights = kernel.data;
        final short[] srcPixels = src.data;
        final short[] dstPixels = dest.data;
        final int roundingBias = divisor / 2;
        final int before = kernel.offset;
        final int after = kernel.width - kernel.offset - 1;
        final int xEnd = src.getWidth() - after;
        final int yEnd = src.getHeight() - after;
        BoofConcurrency.loopBlocks(before, yEnd, kernel.width, workspaces, (work, y0, y1) -> {
            int[] sums = BoofMiscOps.checkDeclare(work, src.width, false);
            for (int y = y0; y < y1; ++y) {
                // First kernel row overwrites the accumulators, so no explicit reset is needed.
                int rowBase = src.startIndex + (y - before) * src.stride - before;
                for (int x = before; x < xEnd; ++x) {
                    int acc = 0;
                    for (int k = 0; k < kernel.width; ++k) {
                        acc += (srcPixels[rowBase + x + k] & 0xFFFF) * weights[k];
                    }
                    sums[x] = acc;
                }
                // Remaining kernel rows are added on top.
                for (int ky = 1; ky < kernel.width; ++ky) {
                    rowBase = src.startIndex + (y + ky - before) * src.stride - before;
                    int kerBase = ky * kernel.width;
                    for (int x = before; x < xEnd; ++x) {
                        int acc = 0;
                        for (int k = 0; k < kernel.width; ++k) {
                            acc += (srcPixels[rowBase + x + k] & 0xFFFF) * weights[kerBase + k];
                        }
                        sums[x] += acc;
                    }
                }
                int dstIdx = dest.startIndex + y * dest.stride + before;
                for (int x = before; x < xEnd; ++x, ++dstIdx) {
                    dstPixels[dstIdx] = (short)((sums[x] + roundingBias) / divisor);
                }
            }
        });
    }

    /**
     * Convolves a 1D integer kernel down the columns of a 32-bit integer image and
     * stores the normalised result in a 16-bit image.
     *
     * <p>Normalisation adds {@code divisor / 2} to each sum and multiplies by the
     * precomputed reciprocal {@code 1.0 / divisor} in double precision before
     * narrowing to {@code short}.
     *
     * <p>Rows {@code offset} (inclusive) through
     * {@code height - (kernelWidth - offset - 1)} (exclusive) of {@code dst} are
     * written, across the full image width; other rows are not touched here.
     *
     * @param kernel     kernel supplying weights, offset and width
     * @param src        input image
     * @param dst        output image; its width and height define the processed region
     * @param divisor    value the weighted sum is divided by
     * @param workspaces per-block scratch storage; may be {@code null}, in which case
     *                   it is obtained through {@code BoofMiscOps.checkDeclare}
     */
    public static void vertical(Kernel1D_S32 kernel, GrayS32 src, GrayI16 dst, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final int[] srcPixels = src.data;
        final short[] dstPixels = dst.data;
        final int[] weights = kernel.data;
        final int radius = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int roundingBias = divisor / 2;
        final double inverseDivisor = 1.0 / (double)divisor;
        final int columns = dst.getWidth();
        final int rows = dst.getHeight();
        final int rowLimit = rows - (taps - radius - 1);
        BoofConcurrency.loopBlocks(radius, rowLimit, workspaces, (work, y0, y1) -> {
            // One accumulator per column; presumably zeroed by checkDeclare (third argument) - TODO confirm.
            int[] sums = BoofMiscOps.checkDeclare(work, columns, true);
            for (int y = y0; y < y1; ++y) {
                int firstSrcRow = y - radius;
                // Accumulate one weighted source row per kernel tap.
                for (int k = 0; k < taps; ++k) {
                    int weight = weights[k];
                    int srcIdx = src.startIndex + (firstSrcRow + k) * src.stride;
                    for (int x = 0; x < columns; ++x, ++srcIdx) {
                        sums[x] += srcPixels[srcIdx] * weight;
                    }
                }
                int dstIdx = dst.startIndex + y * dst.stride;
                for (int x = 0; x < columns; ++x, ++dstIdx) {
                    dstPixels[dstIdx] = (short)((double)(sums[x] + roundingBias) * inverseDivisor);
                }
                // Reset the accumulators so the next row starts from zero.
                Arrays.fill(sums, 0, columns, 0);
            }
        });
    }

    /**
     * Convolves a 1D integer kernel along every row of {@code src} and stores the raw
     * (un-normalised) sums in {@code dst}.
     *
     * <p>For each row only {@code width - (kernelWidth - 1)} output pixels are written,
     * starting at column {@code kernel.getOffset()}; the remaining pixels of
     * {@code dst} are not touched by this method.
     *
     * @param kernel kernel whose {@code data}, offset and width drive the convolution
     * @param src    input image
     * @param dst    output image; written in place
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayS32 src, GrayS32 dst) {
        int[] dataSrc = src.data;
        int[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // The row start must be known before the end index can be derived from it.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += dataSrc[indexSrc++] * dataKer[k];
                }
                dataDst[indexDst++] = total;
            }
        });
    }

    /**
     * Convolves a 1D integer kernel down the columns of {@code src} and stores the raw
     * (un-normalised) sums in {@code dst}.
     *
     * <p>Each processed destination row is first cleared to zero and then used directly
     * as the accumulator. Rows {@code offset} (inclusive) through
     * {@code height - (kernelWidth - offset - 1)} (exclusive) are written, across the
     * full image width; other rows are not touched here.
     *
     * @param kernel kernel supplying weights, offset and width
     * @param src    input image
     * @param dst    output image; its width and height define the processed region
     */
    public static void vertical(Kernel1D_S32 kernel, GrayS32 src, GrayS32 dst) {
        final int[] srcPixels = src.data;
        final int[] dstPixels = dst.data;
        final int[] weights = kernel.data;
        final int radius = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int columns = dst.getWidth();
        final int rows = dst.getHeight();
        final int rowLimit = rows - (taps - radius - 1);
        BoofConcurrency.loopFor(radius, rowLimit, y -> {
            int dstRowStart = dst.startIndex + y * dst.stride;
            // The destination row doubles as the accumulator, so clear it first.
            Arrays.fill(dstPixels, dstRowStart, dstRowStart + columns, 0);
            for (int k = 0; k < taps; ++k) {
                int srcRowStart = src.startIndex + (y - radius + k) * src.stride;
                int weight = weights[k];
                for (int x = 0; x < columns; ++x) {
                    dstPixels[dstRowStart + x] += srcPixels[srcRowStart + x] * weight;
                }
            }
        });
    }

    /**
     * Convolves a 2D integer kernel over {@code src} and stores the raw
     * (un-normalised) sums in {@code dest}.
     *
     * <p>The kernel is indexed as {@code kernel.width} rows of {@code kernel.width}
     * weights. Only pixels with {@code kernel.offset <= x < width - offsetR} and
     * {@code kernel.offset <= y < height - offsetR}, where
     * {@code offsetR = kernel.width - kernel.offset - 1}, are written.
     *
     * @param kernel 2D kernel supplying weights, offset and width
     * @param src    input image
     * @param dest   output image; written in place
     */
    public static void convolve(Kernel2D_S32 kernel, GrayS32 src, GrayS32 dest) {
        final int[] weights = kernel.data;
        final int[] srcPixels = src.data;
        final int[] dstPixels = dest.data;
        final int before = kernel.offset;
        final int after = kernel.width - kernel.offset - 1;
        final int xEnd = src.getWidth() - after;
        final int yEnd = src.getHeight() - after;
        BoofConcurrency.loopFor(before, yEnd, y -> {
            int dstIdx = dest.startIndex + y * dest.stride + before;
            for (int x = before; x < xEnd; ++x, ++dstIdx) {
                int acc = 0;
                for (int ky = 0; ky < kernel.width; ++ky) {
                    // Top-left corner of the kernel footprint, shifted down by ky rows.
                    int srcBase = src.startIndex + (y + ky - before) * src.stride + x - before;
                    int kerBase = ky * kernel.width;
                    for (int kx = 0; kx < kernel.width; ++kx) {
                        acc += srcPixels[srcBase + kx] * weights[kerBase + kx];
                    }
                }
                dstPixels[dstIdx] = acc;
            }
        });
    }

    /**
     * Convolves a 1D integer kernel along every row of {@code src} and stores the
     * normalised result in {@code dst}.
     *
     * <p>Each sum has {@code divisor / 2} added before the integer division by
     * {@code divisor}.
     *
     * <p>For each row only {@code width - (kernelWidth - 1)} output pixels are written,
     * starting at column {@code kernel.getOffset()}; the remaining pixels of
     * {@code dst} are not touched by this method.
     *
     * @param kernel  kernel whose {@code data}, offset and width drive the convolution
     * @param src     input image
     * @param dst     output image; written in place
     * @param divisor value the weighted sum is divided by
     */
    public static void horizontal(Kernel1D_S32 kernel, GrayS32 src, GrayS32 dst, int divisor) {
        int[] dataSrc = src.data;
        int[] dataDst = dst.data;
        int[] dataKer = kernel.data;
        int offset = kernel.getOffset();
        int kernelWidth = kernel.getWidth();
        int halfDivisor = divisor / 2;
        int width = src.getWidth();
        BoofConcurrency.loopFor(0, src.height, i -> {
            int indexDst = dst.startIndex + i * dst.stride + offset;
            // The row start must be known before the end index can be derived from it.
            int j = src.startIndex + i * src.stride;
            int jEnd = j + width - (kernelWidth - 1);
            for (; j < jEnd; ++j) {
                int total = 0;
                int indexSrc = j;
                for (int k = 0; k < kernelWidth; ++k) {
                    total += dataSrc[indexSrc++] * dataKer[k];
                }
                dataDst[indexDst++] = (total + halfDivisor) / divisor;
            }
        });
    }

    /**
     * Convolves a 1D integer kernel down the columns of {@code src} and stores the
     * normalised result in {@code dst}.
     *
     * <p>Normalisation adds {@code divisor / 2} to each sum and multiplies by the
     * precomputed reciprocal {@code 1.0 / divisor} in double precision before
     * truncating back to {@code int}.
     *
     * <p>Rows {@code offset} (inclusive) through
     * {@code height - (kernelWidth - offset - 1)} (exclusive) of {@code dst} are
     * written, across the full image width; other rows are not touched here.
     *
     * @param kernel     kernel supplying weights, offset and width
     * @param src        input image
     * @param dst        output image; its width and height define the processed region
     * @param divisor    value the weighted sum is divided by
     * @param workspaces per-block scratch storage; may be {@code null}, in which case
     *                   it is obtained through {@code BoofMiscOps.checkDeclare}
     */
    public static void vertical(Kernel1D_S32 kernel, GrayS32 src, GrayS32 dst, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final int[] srcPixels = src.data;
        final int[] dstPixels = dst.data;
        final int[] weights = kernel.data;
        final int radius = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int roundingBias = divisor / 2;
        final double inverseDivisor = 1.0 / (double)divisor;
        final int columns = dst.getWidth();
        final int rows = dst.getHeight();
        final int rowLimit = rows - (taps - radius - 1);
        BoofConcurrency.loopBlocks(radius, rowLimit, workspaces, (work, y0, y1) -> {
            // One accumulator per column; presumably zeroed by checkDeclare (third argument) - TODO confirm.
            int[] sums = BoofMiscOps.checkDeclare(work, columns, true);
            for (int y = y0; y < y1; ++y) {
                int firstSrcRow = y - radius;
                // Accumulate one weighted source row per kernel tap.
                for (int k = 0; k < taps; ++k) {
                    int weight = weights[k];
                    int srcIdx = src.startIndex + (firstSrcRow + k) * src.stride;
                    for (int x = 0; x < columns; ++x, ++srcIdx) {
                        sums[x] += srcPixels[srcIdx] * weight;
                    }
                }
                int dstIdx = dst.startIndex + y * dst.stride;
                for (int x = 0; x < columns; ++x, ++dstIdx) {
                    dstPixels[dstIdx] = (int)((double)(sums[x] + roundingBias) * inverseDivisor);
                }
                // Reset the accumulators so the next row starts from zero.
                Arrays.fill(sums, 0, columns, 0);
            }
        });
    }

    /**
     * Convolves a 2D integer kernel over {@code src} and stores the normalised result
     * in {@code dest}.
     *
     * <p>The kernel is indexed as {@code kernel.width} rows of {@code kernel.width}
     * weights. Each sum has {@code divisor / 2} added before integer division by
     * {@code divisor}.
     *
     * <p>Only pixels with {@code kernel.offset <= x < width - offsetR} and
     * {@code kernel.offset <= y < height - offsetR}, where
     * {@code offsetR = kernel.width - kernel.offset - 1}, are written.
     *
     * @param kernel     2D kernel supplying weights, offset and width
     * @param src        input image
     * @param dest       output image; written in place
     * @param divisor    value the weighted sum is divided by
     * @param workspaces per-block scratch storage; may be {@code null}, in which case
     *                   it is obtained through {@code BoofMiscOps.checkDeclare}
     */
    public static void convolve(Kernel2D_S32 kernel, GrayS32 src, GrayS32 dest, int divisor, @Nullable GrowArray<DogArray_I32> workspaces) {
        workspaces = BoofMiscOps.checkDeclare(workspaces, DogArray_I32::new);
        final int[] weights = kernel.data;
        final int[] srcPixels = src.data;
        final int[] dstPixels = dest.data;
        final int roundingBias = divisor / 2;
        final int before = kernel.offset;
        final int after = kernel.width - kernel.offset - 1;
        final int xEnd = src.getWidth() - after;
        final int yEnd = src.getHeight() - after;
        BoofConcurrency.loopBlocks(before, yEnd, kernel.width, workspaces, (work, y0, y1) -> {
            int[] sums = BoofMiscOps.checkDeclare(work, src.width, false);
            for (int y = y0; y < y1; ++y) {
                // First kernel row overwrites the accumulators, so no explicit reset is needed.
                int rowBase = src.startIndex + (y - before) * src.stride - before;
                for (int x = before; x < xEnd; ++x) {
                    int acc = 0;
                    for (int k = 0; k < kernel.width; ++k) {
                        acc += srcPixels[rowBase + x + k] * weights[k];
                    }
                    sums[x] = acc;
                }
                // Remaining kernel rows are added on top.
                for (int ky = 1; ky < kernel.width; ++ky) {
                    rowBase = src.startIndex + (y + ky - before) * src.stride - before;
                    int kerBase = ky * kernel.width;
                    for (int x = before; x < xEnd; ++x) {
                        int acc = 0;
                        for (int k = 0; k < kernel.width; ++k) {
                            acc += srcPixels[rowBase + x + k] * weights[kerBase + k];
                        }
                        sums[x] += acc;
                    }
                }
                int dstIdx = dest.startIndex + y * dest.stride + before;
                for (int x = before; x < xEnd; ++x, ++dstIdx) {
                    dstPixels[dstIdx] = (sums[x] + roundingBias) / divisor;
                }
            }
        });
    }
}

