Initial commit

2024-02-19 20:00:08 +01:00 · 2024-02-19 20:00:08 +01:00 · 6eda53859e
commit 6eda53859e
35 changed files with 2442 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+/bin/*
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "conv/simd_utils"]
+	path = conv/simd_utils
+	url = https://github.com/JishinMaster/simd_utils.git
--- a/19
+++ b/19
@ -0,0 +1,19 @@
+Copyright (c) 2024 WeebDataHoarder, xyz2yuv Contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@ -0,0 +1,11 @@
+# xyz2yuv tool
+Decode DCI XYZ' and apply conversions to your desired output colorspace. Supports Rec. 709 and Rec. 2020, with adjustable precision and gamma values.
+
+Supports AVX-512 and AVX2 targets, and a generic implementation in C and Go as well.
+
+## Dependencies
+* CGO
+* libopenjp2-dev
+* libavformat-dev
+* libavcodec-dev
+* libavutil-dev
--- a/bin/.gitkeep
+++ b/bin/.gitkeep
--- a/build.sh
+++ b/build.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Builds the xyz2yuv binary and its 2/4/8-pump SIMD variants into bin/.
+
+# Stop at the first failing command so that a failed cd or build is not
+# silently followed by further builds.
+set -e
+
+# Run from the directory that contains this script.
+cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)"
+
+# Prefer the toolchain under GOROOT when it is set, otherwise use go from PATH.
+CMD=go
+if [[ -n "${GOROOT:-}" ]]; then
+  CMD="${GOROOT}/bin/go"
+fi
+
+# Environment and arguments shared by every build.
+export CGO_ENABLED=1 GOOS=linux GOARCH=amd64
+BUILD_ARGS=(build -v -buildvcs=false -trimpath -gcflags=all="-l" -ldflags="-s -w -buildid=")
+
+# The default build leaves CGO_CFLAGS as inherited from the environment.
+"${CMD}" "${BUILD_ARGS[@]}" -o bin/xyz2yuv ./
+CGO_CFLAGS="-DSIMD_PUMPS=2" "${CMD}" "${BUILD_ARGS[@]}" -o bin/xyz2yuv_2pump ./
+CGO_CFLAGS="-DSIMD_PUMPS=4" "${CMD}" "${BUILD_ARGS[@]}" -o bin/xyz2yuv_4pump ./
+CGO_CFLAGS="-DSIMD_PUMPS=8" "${CMD}" "${BUILD_ARGS[@]}" -o bin/xyz2yuv_8pump ./
--- a/colorspace/adaptation.go
+++ b/colorspace/adaptation.go
@ -0,0 +1,53 @@
+package colorspace
+
+import "gonum.org/v1/gonum/mat"
+
+// ChromaticAdaptation is a 3x3 matrix used to compare white points.
+type ChromaticAdaptation mat.Dense
+
+// AdaptXYZ builds the diagonal scaling matrix between two white points.
+//
+// Both illuminants are expanded to XYZ and multiplied by the receiver matrix;
+// the result is a 3x3 diagonal matrix whose entries are the per-component
+// ratios (to / from) of those two products.
+func (a ChromaticAdaptation) AdaptXYZ(from, to Illuminant) mat.Matrix {
+	m := (*mat.Dense)(&a)
+
+	// project maps a white point through the receiver matrix.
+	project := func(white Illuminant) *mat.VecDense {
+		x, y, z := white.ToXYZ()
+		var out mat.VecDense
+		out.MulVec(m, mat.NewVecDense(3, []float64{x, y, z}))
+		return &out
+	}
+
+	src := project(from)
+	dst := project(to)
+
+	ratios := make([]float64, 3)
+	for i := range ratios {
+		ratios[i] = dst.AtVec(i) / src.AtVec(i)
+	}
+	return mat.NewDiagDense(3, ratios)
+}
+
+// Chromatic adaptation matrices, each a 3x3 *mat.Dense given row by row and
+// named after the adaptation method it belongs to.
+var (
+	// ChromaticAdaptationBradford is the Bradford matrix.
+	ChromaticAdaptationBradford = mat.NewDense(3, 3, []float64{
+		0.8951, 0.2664, -0.1614,
+		-0.7502, 1.7135, 0.0367,
+		0.0389, -0.0685, 1.0296,
+	})
+	// ChromaticAdaptationCMCCAT2000 is the CMCCAT2000 matrix.
+	ChromaticAdaptationCMCCAT2000 = mat.NewDense(3, 3, []float64{
+		0.7982, 0.3389, -0.1371,
+		-0.5918, 1.5512, 0.0406,
+		0.0008, 0.0239, 0.9753,
+	})
+	// ChromaticAdaptationCIECAT02 is the CIECAT02 matrix.
+	ChromaticAdaptationCIECAT02 = mat.NewDense(3, 3, []float64{
+		0.7328, 0.4296, -0.1624,
+		-0.7036, 1.6975, 0.0061,
+		0.0030, 0.0136, 0.9834,
+	})
+	// ChromaticAdaptationSharp is the Sharp matrix.
+	ChromaticAdaptationSharp = mat.NewDense(3, 3, []float64{
+		1.2694, -0.0988, -0.1706,
+		-0.8364, 1.8006, 0.0357,
+		0.0297, -0.0315, 1.0018,
+	})
+)
--- a/colorspace/chromaticity.go
+++ b/colorspace/chromaticity.go
@ -0,0 +1,145 @@
+package colorspace
+
+import "gonum.org/v1/gonum/mat"
+
+// Chromaticity describes an RGB colour space by the xy coordinates of its
+// three primaries and of its white point.
+type Chromaticity struct {
+	Red   ColorCoordinate
+	Green ColorCoordinate
+	Blue  ColorCoordinate
+	White Illuminant
+}
+
+// ConversionXYZ derives the linear RGB <-> XYZ matrices of the colour space.
+//
+// The primaries are expanded to XYZ columns and each column is scaled so that
+// RGB (1, 1, 1) maps onto the white point. "to" is the resulting RGB-to-XYZ
+// matrix and "from" is its inverse. It panics if a matrix inversion fails.
+func (c Chromaticity) ConversionXYZ() (to, from *mat.Dense) {
+	xr, yr, zr := c.Red.ToXYZ()
+	xg, yg, zg := c.Green.ToXYZ()
+	xb, yb, zb := c.Blue.ToXYZ()
+	xw, yw, zw := c.White.ToXYZ()
+
+	// One primary per column.
+	primaries := mat.NewDense(3, 3, []float64{
+		xr, xg, xb,
+		yr, yg, yb,
+		zr, zg, zb,
+	})
+	white := mat.NewVecDense(3, []float64{xw, yw, zw})
+
+	var primariesInv mat.Dense
+	if err := primariesInv.Inverse(primaries); err != nil {
+		panic(err)
+	}
+
+	// Per-primary factors that make the scaled primaries sum to the white point.
+	var scale mat.VecDense
+	scale.MulVec(&primariesInv, white)
+
+	// Scale every column of the primaries matrix by its factor.
+	scaled := mat.NewDense(3, 3, nil)
+	for row := 0; row < 3; row++ {
+		for col := 0; col < 3; col++ {
+			scaled.Set(row, col, scale.AtVec(col)*primaries.At(row, col))
+		}
+	}
+
+	var rgb2xyz, xyz2rgb mat.Dense
+	rgb2xyz.CloneFrom(scaled)
+	if err := xyz2rgb.Inverse(scaled); err != nil {
+		panic(err)
+	}
+
+	return &rgb2xyz, &xyz2rgb
+}
+
+/*
+func (c Chromaticity) XYZToRGB(connectionSpaceWhite Illuminant, adaptation ChromaticAdaptation) {
+	var err error
+	var RGB *mat.Dense
+	var W1, W2 *mat.VecDense
+
+	{
+		Xr, Yr, Zr := c.Red.ToXYZ()
+		Xg, Yg, Zg := c.Green.ToXYZ()
+		Xb, Yb, Zb := c.Blue.ToXYZ()
+
+		RGB = mat.NewDense(3, 3, []float64{
+			Xr, Xg, Xb,
+			Yr, Yg, Yb,
+			Zr, Zg, Zb,
+		})
+	}
+
+	{
+		Xw, Yw, Zw := c.White.ToXYZ()
+
+		W1 = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
+	}
+
+	{
+		Xw, Yw, Zw := connectionSpaceWhite.ToXYZ()
+
+		W2 = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
+	}
+
+	var tmp1, tmp2, M, Mc, M2, rgb2xyz, xyz2rgb, source_destination_whites, destination_source_whites, adapted_rgb2xyz_2, adapted_xyz2rgb_2 mat.Dense
+
+	var S, crdS, crdD, RA mat.VecDense
+
+	if err = tmp1.Inverse(RGB); err != nil {
+		panic(err)
+	}
+
+	S.MulVec(&tmp1, W1)
+
+	//TODO
+	M.Mul(&S, RGB)
+
+	rgb2xyz.CloneFrom(&M)
+
+	if err = xyz2rgb.Inverse(&M); err != nil {
+		panic(err)
+	}
+
+
+	// chromatic adaptation
+
+	crdS.MulVec((*mat.Dense)(&adaptation), W1)
+	crdD.MulVec((*mat.Dense)(&adaptation), W2)
+
+	Mt := mat.NewDiagDense(3, []float64{
+		crdD.AtVec(0) / crdS.AtVec(0),
+		crdD.AtVec(1) / crdS.AtVec(1),
+		crdD.AtVec(2) / crdS.AtVec(2),
+	})
+
+	if err = tmp1.Inverse((*mat.Dense)(&adaptation)); err != nil {
+		panic(err)
+	}
+
+	tmp2.Mul(&tmp1, Mt)
+	Mc.Mul(&tmp2, (*mat.Dense)(&adaptation))
+
+	source_destination_whites.CloneFrom(&Mc)
+
+	if err = destination_source_whites.Inverse(&Mc); err != nil {
+		panic(err)
+	}
+
+	M2.Mul(&Mc, &M)
+
+	adapted_rgb2xyz_2.CloneFrom(&M2)
+
+	if err = adapted_xyz2rgb_2.Inverse(&M2); err != nil {
+		panic(err)
+	}
+
+	RA.MulVec(&Mc, W1)
+
+}
+*/
--- a/colorspace/color.go
+++ b/colorspace/color.go
@ -0,0 +1,18 @@
+package colorspace
+
+// ColorCoordinate is a two-component chromaticity coordinate stored as {x, y}.
+type ColorCoordinate [2]float64
+
+// X returns the first component of the coordinate.
+func (c ColorCoordinate) X() float64 {
+	x := c[0]
+	return x
+}
+
+// Y returns the second component of the coordinate.
+func (c ColorCoordinate) Y() float64 {
+	y := c[1]
+	return y
+}
+
+// ToXYZ expands the coordinate to XYZ with Y fixed at 1, i.e.
+// X = x/y and Z = (1-x-y)/y.
+func (c ColorCoordinate) ToXYZ() (X, Y, Z float64) {
+	x, y := c[0], c[1]
+	return x / y, 1.0, (1 - x - y) / y
+}
--- a/colorspace/illuminant.go
+++ b/colorspace/illuminant.go
@ -0,0 +1,18 @@
+package colorspace
+
+// Illuminant is a white point expressed as a chromaticity coordinate; it is an
+// alias of ColorCoordinate.
+type Illuminant = ColorCoordinate
+
+// Standard Illuminants in 2 degree form
+var (
+	// IlluminantD50 and IlluminantD55 are the D50 and D55 white points.
+	IlluminantD50 = Illuminant{0.34567, 0.35850}
+	IlluminantD55 = Illuminant{0.33242, 0.34743}
+
+	// IlluminantD60 P3-D60 (ACES Cinema)
+	IlluminantD60 = Illuminant{0.32168, 0.33767}
+
+	// IlluminantD63 P3-DCI (Theater)
+	IlluminantD63 = Illuminant{0.314, 0.351}
+
+	// IlluminantD65 Standard D65 for Rec. 709, Rec. 2020, sRGB and many more
+	IlluminantD65 = Illuminant{0.31271, 0.32902}
+)
--- a/colorspace/relative.go
+++ b/colorspace/relative.go
@ -0,0 +1,79 @@
+package colorspace
+
+// RelativeSystem bundles what is needed to encode linear RGB for one output
+// colour space: its chromaticities, the transfer function applied to linear
+// values, and its YCbCr coefficients.
+type RelativeSystem struct {
+	Chromaticity Chromaticity
+
+	fromLinearTransfer TransferFunction
+
+	YCbCr YCbCrConverter
+}
+
+// FromLinear applies the system's transfer function to the linear value c.
+func (s RelativeSystem) FromLinear(c float64) float64 {
+	encode := s.fromLinearTransfer
+	return encode(c)
+}
+
+// NewRelativeSystem assembles a RelativeSystem from its three parts.
+func NewRelativeSystem(chromaticity Chromaticity, fromLinearTransfer TransferFunction, converter YCbCrConverter) RelativeSystem {
+	var system RelativeSystem
+	system.Chromaticity = chromaticity
+	system.fromLinearTransfer = fromLinearTransfer
+	system.YCbCr = converter
+	return system
+}
+
+// Predefined output systems. Every variant of a family shares the same
+// primaries, white point and YCbCr coefficients and differs only in the
+// transfer function applied to linear values.
+var (
+	// SystemSRGB uses the Rec. 709 primaries with the sRGB companding curve.
+	SystemSRGB = NewRelativeSystem(Chromaticity{
+		Red:   ColorCoordinate{0.640, 0.330},
+		Green: ColorCoordinate{0.300, 0.600},
+		Blue:  ColorCoordinate{0.150, 0.060},
+		White: IlluminantD65,
+	}, CompandingSRGB, YCbCr_Rec709)
+
+	// SystemRec709 uses the piecewise Rec. 709 companding curve.
+	SystemRec709 = NewRelativeSystem(Chromaticity{
+		Red:   ColorCoordinate{0.640, 0.330},
+		Green: ColorCoordinate{0.300, 0.600},
+		Blue:  ColorCoordinate{0.150, 0.060},
+		White: IlluminantD65,
+	}, CompandingRec709, YCbCr_Rec709)
+
+	// SystemRec709_Pure uses a pure power curve with exponent 0.45.
+	SystemRec709_Pure = NewRelativeSystem(Chromaticity{
+		Red:   ColorCoordinate{0.640, 0.330},
+		Green: ColorCoordinate{0.300, 0.600},
+		Blue:  ColorCoordinate{0.150, 0.060},
+		White: IlluminantD65,
+	}, PureRec709, YCbCr_Rec709)
+
+	// SystemRec709_Pure22 uses a pure power curve with exponent 1/2.2.
+	SystemRec709_Pure22 = NewRelativeSystem(Chromaticity{
+		Red:   ColorCoordinate{0.640, 0.330},
+		Green: ColorCoordinate{0.300, 0.600},
+		Blue:  ColorCoordinate{0.150, 0.060},
+		White: IlluminantD65,
+	}, PureRec709_22, YCbCr_Rec709)
+
+	// SystemRec709_Pure24 uses a pure power curve with exponent 1/2.4.
+	SystemRec709_Pure24 = NewRelativeSystem(Chromaticity{
+		Red:   ColorCoordinate{0.640, 0.330},
+		Green: ColorCoordinate{0.300, 0.600},
+		Blue:  ColorCoordinate{0.150, 0.060},
+		White: IlluminantD65,
+	}, PureRec709_24, YCbCr_Rec709)
+
+	// SystemRec2020 uses the piecewise Rec. 2020 companding curve.
+	SystemRec2020 = NewRelativeSystem(Chromaticity{
+		Red:   ColorCoordinate{0.708, 0.292},
+		Green: ColorCoordinate{0.170, 0.797},
+		Blue:  ColorCoordinate{0.131, 0.046},
+		White: IlluminantD65,
+	}, CompandingRec2020, YCbCr_Rec2020)
+
+	// SystemRec2020_Pure uses a pure power curve with exponent 0.45.
+	SystemRec2020_Pure = NewRelativeSystem(Chromaticity{
+		Red:   ColorCoordinate{0.708, 0.292},
+		Green: ColorCoordinate{0.170, 0.797},
+		Blue:  ColorCoordinate{0.131, 0.046},
+		White: IlluminantD65,
+	}, PureRec2020, YCbCr_Rec2020)
+
+	// SystemRec2020_Pure24 uses a pure power curve with exponent 1/2.4.
+	// It previously reused PureRec2020, which made it identical to
+	// SystemRec2020_Pure.
+	SystemRec2020_Pure24 = NewRelativeSystem(Chromaticity{
+		Red:   ColorCoordinate{0.708, 0.292},
+		Green: ColorCoordinate{0.170, 0.797},
+		Blue:  ColorCoordinate{0.131, 0.046},
+		White: IlluminantD65,
+	}, PureRec2020_24, YCbCr_Rec2020)
+)
--- a/colorspace/types.go
+++ b/colorspace/types.go
@ -0,0 +1,66 @@
+package colorspace
+
+import "math"
+
+// TransferFunction maps a single component value to another value.
+type TransferFunction func(e float64) float64
+
+// DCINormalizationFactor is the ratio 48/52.37 that TransferFromDCIXYZ divides by.
+// NOTE(review): presumably the DCI reference white luminance over the peak
+// luminance, both in cd/m^2 - confirm against the DCI specification.
+const DCINormalizationFactor = 48 / 52.37
+
+// Gamma exponents used by the transfer functions and lookup tables.
+const (
+	Gamma22      = 2.2
+	Gamma24      = 2.4
+	GammaDCIXYZ  = 2.6
+	GammaSRGB    = Gamma22
+	GammaRec709  = 1 / 0.45
+	GammaRec2020 = GammaRec709
+)
+
+// alpha and beta are the constants of the piecewise curve in CompandingRec709:
+// beta is the threshold below which the linear segment is used.
+// https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2020-2-201510-I!!PDF-E.pdf
+const alpha = 1.09929682680944 // 10 * Math.pow(beta, 0.55)
+const beta = 0.018053968510807
+
+// Transfer functions. The Companding* and Pure* functions encode a linear
+// value; TransferFromDCIXYZ adjusts a decoded DCI XYZ value.
+var (
+	// TransferFromDCIXYZ divides by DCINormalizationFactor.
+	TransferFromDCIXYZ TransferFunction = func(e float64) float64 {
+		return e / DCINormalizationFactor
+	}
+
+	// CompandingSRGB is the piecewise sRGB encoding: a linear segment up to
+	// 0.0031308 and a power segment above it.
+	//
+	// The power segment uses the exponent 1/2.4 of the sRGB definition. The
+	// approximate overall gamma GammaSRGB (2.2) must not be used here: with it
+	// the two segments do not meet at the threshold.
+	CompandingSRGB TransferFunction = func(e float64) float64 {
+		if e <= 0.0031308 {
+			return 12.92 * e
+		}
+		return 1.055*math.Pow(e, 1/Gamma24) - 0.055
+	}
+
+	// CompandingRec709 ITU-R BT.709
+	CompandingRec709 TransferFunction = func(e float64) float64 {
+		if e < beta {
+			return 4.5 * e
+		}
+		return alpha*math.Pow(e, 1/GammaRec709) - (alpha - 1)
+	}
+
+	// PureRec709 is a pure power curve with exponent 1/GammaRec709 (0.45).
+	PureRec709 TransferFunction = func(e float64) float64 {
+		return math.Pow(e, 1/GammaRec709)
+	}
+
+	// PureRec709_22 is a pure power curve with exponent 1/2.2.
+	PureRec709_22 TransferFunction = func(e float64) float64 {
+		return math.Pow(e, 1/Gamma22)
+	}
+
+	// PureRec709_24 is a pure power curve with exponent 1/2.4.
+	PureRec709_24 TransferFunction = func(e float64) float64 {
+		return math.Pow(e, 1/Gamma24)
+	}
+
+	// The Rec. 2020 functions reuse the Rec. 709 implementations.
+	CompandingRec2020 = CompandingRec709
+	PureRec2020       = PureRec709
+	PureRec2020_22    = PureRec709_22
+	PureRec2020_24    = PureRec709_24
+)
+
+// LUT is a table of precomputed float64 values indexed by integer code value.
+type LUT []float64
+
+// NewGammaLUT returns a table with 2^bits entries in which entry i holds
+// (i / (2^bits - 1)) raised to the power gamma.
+func NewGammaLUT(gamma float64, bits int) (lut LUT) {
+	entries := 1 << bits
+	maxCode := float64(entries - 1)
+
+	lut = make(LUT, entries)
+	for code := range lut {
+		lut[code] = math.Pow(float64(code)/maxCode, gamma)
+	}
+	return lut
+}
--- a/colorspace/xyz.go
+++ b/colorspace/xyz.go
@ -0,0 +1,28 @@
+package colorspace
+
+import "math"
+
+// XYZSystem An absolute representation
+type XYZSystem struct {
+	// gamma is the exponent applied by ToLinear.
+	gamma float64
+
+	// ToLinearLUT is the table read by ToLinearFrom16.
+	ToLinearLUT LUT
+
+	// ToLinearTransfer holds any adjustments applied on top of gamma.
+	ToLinearTransfer TransferFunction
+}
+
+// ToLinearFrom16 looks c up in ToLinearLUT and passes the entry through
+// ToLinearTransfer.
+func (s XYZSystem) ToLinearFrom16(c uint16) float64 {
+	decoded := s.ToLinearLUT[c]
+	return s.ToLinearTransfer(decoded)
+}
+
+// ToLinear raises c to the power gamma and passes the result through
+// ToLinearTransfer.
+func (s XYZSystem) ToLinear(c float64) float64 {
+	decoded := math.Pow(c, s.gamma)
+	return s.ToLinearTransfer(decoded)
+}
+
+// NewXYZSystem returns an XYZSystem that decodes with the given gamma.
+//
+// gamma is stored for ToLinear and also used to precompute the 16-bit lookup
+// table read by ToLinearFrom16. toLinearTransfer is applied after the gamma
+// decoding in both paths.
+func NewXYZSystem(gamma float64, toLinearTransfer TransferFunction) XYZSystem {
+	return XYZSystem{
+		// Without this field ToLinear would raise its input to the power 0
+		// and return the same value for every input.
+		gamma:            gamma,
+		ToLinearLUT:      NewGammaLUT(gamma, 16),
+		ToLinearTransfer: toLinearTransfer,
+	}
+}
+
+var DCIXYZSystem = NewXYZSystem(GammaDCIXYZ, TransferFromDCIXYZ)
--- a/colorspace/yuv.go
+++ b/colorspace/yuv.go
@ -0,0 +1,35 @@
+package colorspace
+
+import "gonum.org/v1/gonum/mat"
+
+// YCbCrConverter holds the luma coefficients of the red, green and blue
+// channels.
+type YCbCrConverter struct {
+	Kr, Kg, Kb float64
+}
+
+// ConversionRGB builds the matrices between RGB and YPbPr from the luma
+// coefficients. "to" converts YPbPr to RGB and "from" converts RGB to YPbPr.
+func (c YCbCrConverter) ConversionRGB() (to, from *mat.Dense) {
+	const half = 1. / 2.
+
+	// Denominators of the Pb and Pr rows.
+	pbDen := 1 - c.Kb
+	prDen := 1 - c.Kr
+
+	// Gains applied to Pr and Pb when reconstructing RGB.
+	prGain := 2 - 2*c.Kr
+	pbGain := 2 - 2*c.Kb
+
+	from = mat.NewDense(3, 3, []float64{
+		c.Kr, c.Kg, c.Kb,
+		-half * (c.Kr / pbDen), -half * (c.Kg / pbDen), half,
+		half, -half * (c.Kg / prDen), -half * (c.Kb / prDen),
+	})
+	to = mat.NewDense(3, 3, []float64{
+		1, 0, prGain,
+		1, -(c.Kb / c.Kg) * pbGain, -(c.Kr / c.Kg) * prGain,
+		1, pbGain, 0,
+	})
+
+	return to, from
+}
+
+// NewYCbCrConverter returns a converter for the given luma coefficients.
+func NewYCbCrConverter(kr, kg, kb float64) YCbCrConverter {
+	var converter YCbCrConverter
+	converter.Kr, converter.Kg, converter.Kb = kr, kg, kb
+	return converter
+}
+
+// YCbCr_Rec709 holds the Rec. 709 luma coefficients (Kr, Kg, Kb).
+var YCbCr_Rec709 = NewYCbCrConverter(0.2126, 0.7152, 0.0722)
+
+// YCbCr_Rec2020 holds the Rec. 2020 luma coefficients (Kr, Kg, Kb).
+// Kr is 0.2627; the three coefficients sum to 1.
+var YCbCr_Rec2020 = NewYCbCrConverter(0.2627, 0.6780, 0.0593)
--- a/conv/conv.c
+++ b/conv/conv.c
@ -0,0 +1,103 @@
+#include "conv.h"
+
+#include <math.h>
+#include <stdlib.h>
+
+// 2^16 - 1 (65535) as a double.
+const double uint16Max = (double)((1<<16)-1);
+// 2^15 (32768) as a double.
+const double int16MaxPlusOne = (double)(1<<15);
+
+// Matrix pointers and the reciprocal of the RGB gamma, all set by init().
+const double* restrict xyz2rgb_g;
+const double* restrict rgb2yuv_g;
+double rgbGamma_g;
+
+
+// Pipeline selection. Exactly one branch below defines DecoderInformation and
+// provides the line converters:
+//   - USE_SIMD and AVX2 with AVX-512 F/VL/BW and USE_512_WIDE_PIPELINE:
+//     512-bit pipeline (float: 5 pixels per vector, double: 2 pixels).
+//   - USE_SIMD and AVX2 otherwise: 256-bit pipeline (float: 2 pixels,
+//     double: 1 pixel), with or without FMA.
+//   - anything else: generic scalar pipeline from conv_generic.h.
+// typed_converter_platform is not defined in this file; presumably it comes
+// from conv_gen.h.
+#if USE_SIMD
+#include "conv_gen.h"
+#endif
+
+#if USE_SIMD && __AVX2__
+#include "conv_avx.h"
+
+// NOTE(review): conv_avx.h additionally requires __AVX512DQ__ before it
+// defines load_packed_512 and friends; this condition does not.
+#if USE_512_WIDE_PIPELINE && __AVX512F__ && __AVX512VL__ && __AVX512BW__
+//512-bit wide pipeline
+typed_converter_platform(x86_64, SIMD_PUMPS, float, 5, 1, load_packed_512f, store_packed_512f);
+typed_converter_platform(x86_64, SIMD_PUMPS, double, 2, 1, load_packed_512, store_packed_512);
+
+const char* DecoderInformation = "SIMD AVX-512 512-bit pipeline (2d 5f " str(SIMD_PUMPS) "pump)";
+#else
+
+// When the compiler does not define __FMA__, define it as 0 so that it can be
+// passed as the FMA flag of typed_converter_platform below.
+#if !defined(__FMA__)
+#define __FMA__ 0
+const char* DecoderInformation = "SIMD AVX2 256-bit pipeline (1d 2f " str(SIMD_PUMPS) "pump)";
+#else
+const char* DecoderInformation = "SIMD AVX2 + FMA 256-bit pipeline (1d 2f " str(SIMD_PUMPS) "pump)";
+#endif
+
+//256-bit wide pipeline
+typed_converter_platform(x86_64, SIMD_PUMPS, float, 2, __FMA__, load_packed_256f, store_packed_256f);
+typed_converter_platform(x86_64, SIMD_PUMPS, double, 1, __FMA__, load_packed_256, store_packed_256);
+
+#endif
+
+#else
+//No ASM defined
+// conv_generic.h is included twice, once per floating point type selected
+// through fType.
+#define fType float
+#include "conv_generic.h"
+#undef fType
+#define fType double
+#include "conv_generic.h"
+
+// Force USE_SIMD to 0 so that init() skips the SIMD matrix setup.
+#undef USE_SIMD
+#define USE_SIMD 0
+
+const char* DecoderInformation = "Generic scalar pipeline (1d 1f 1pump)";
+
+
+#endif
+
+// Returns the description string of the pipeline selected at compile time.
+const char* decoder_information() {
+    return DecoderInformation;
+}
+
+
+// Prepares the global conversion state.
+//
+// The matrix pointers are stored as given (not copied), the reciprocal of
+// rgbGamma is stored, and both XYZ lookup tables are filled with
+// (code / maxCode) raised to xyzGamma. With USE_SIMD enabled the matrices are
+// additionally loaded into their vector form.
+void init(const double* restrict xyz2rgb, const double* restrict rgb2yuv, double xyzGamma, double rgbGamma) {
+    const int tableEntries = 1 << XYZ_LOOKUP_TABLE_SIZE;
+    const int maxCode = tableEntries - 1;
+
+    xyz2rgb_g = xyz2rgb;
+    rgb2yuv_g = rgb2yuv;
+    rgbGamma_g = 1./rgbGamma;
+
+    for (int code = 0; code < tableEntries; code++) {
+        xyz12_to_linear_double[code] = pow((double)code / maxCode, xyzGamma);
+        // The float table is derived from the double table entry.
+        xyz12_to_linear_float[code] = xyz12_to_linear_double[code];
+    }
+
+    #if USE_SIMD
+        _load_matrix_double(xyz2rgb_mat_double, xyz2rgb);
+        _load_matrix_float(xyz2rgb_mat_float, xyz2rgb);
+        _load_matrix_double(rgb2yuv_mat_double, rgb2yuv);
+        _load_matrix_float(rgb2yuv_mat_float, rgb2yuv);
+    #endif
+}
+
+// Converts a whole frame by running the float line converter once per row.
+// After each row the input advances by width*3 samples and every output plane
+// by width samples. height is forwarded unchanged to the line converter.
+void convert_frame_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
+    for (int rowsLeft = height; rowsLeft > 0; rowsLeft--) {
+        convert_line_dci_xyz12_to_yuv16_float(in, luma, cb, cr, width, height);
+
+        // Step to the next row of the packed input and of each plane.
+        in   += width*3;
+        luma += width;
+        cb   += width;
+        cr   += width;
+    }
+}
+
+// Converts a whole frame by running the double line converter once per row.
+// After each row the input advances by width*3 samples and every output plane
+// by width samples. height is forwarded unchanged to the line converter.
+void convert_frame_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
+    for (int rowsLeft = height; rowsLeft > 0; rowsLeft--) {
+        convert_line_dci_xyz12_to_yuv16_double(in, luma, cb, cr, width, height);
+
+        // Step to the next row of the packed input and of each plane.
+        in   += width*3;
+        luma += width;
+        cb   += width;
+        cr   += width;
+    }
+}
--- a/conv/conv.go
+++ b/conv/conv.go
@ -0,0 +1,127 @@
+package conv
+
+import (
+	"gonum.org/v1/gonum/mat"
+	"runtime"
+	"unsafe"
+)
+
+/*
+#cgo CFLAGS: -march=native -mtune=native -Ofast -std=c99
+#cgo LDFLAGS: -lm
+#include "conv.h"
+*/
+import "C"
+
+var matPinner runtime.Pinner
+
+// DecoderInformation returns the string reported by the C function
+// decoder_information, converted to a Go string.
+func DecoderInformation() string {
+	return C.GoString(C.decoder_information())
+}
+
+// InitData passes the conversion matrices and gamma values to the C init
+// function.
+//
+// The backing arrays of both matrices are pinned on the package-level
+// matPinner and are not unpinned here: the C side keeps the pointers it is
+// given.
+func InitData(xyz2rgb, rgb2yuv *mat.Dense, xyzGamma, rgbGamma float64) {
+	xyzCoeffs := xyz2rgb.RawMatrix().Data
+	yuvCoeffs := rgb2yuv.RawMatrix().Data
+
+	xyzPtr := unsafe.Pointer(unsafe.SliceData(xyzCoeffs))
+	yuvPtr := unsafe.Pointer(unsafe.SliceData(yuvCoeffs))
+
+	matPinner.Pin(xyzPtr)
+	matPinner.Pin(yuvPtr)
+
+	C.init(
+		(*C.double)(xyzPtr),
+		(*C.double)(yuvPtr),
+		C.double(xyzGamma),
+		C.double(rgbGamma),
+	)
+}
+
+// ConvertFrameDCIXYZToYUV16Double calls the C routine
+// convert_frame_dci_xyz12_to_yuv16_double on the given buffers.
+//
+// The backing arrays of in, luma, cb and cr are pinned for the duration of the
+// call and unpinned when the function returns. width and height are passed
+// through unchanged.
+func ConvertFrameDCIXYZToYUV16Double(in []uint32, luma, cb, cr []uint16, width, height int) {
+	// Order: input, luma, cb, cr.
+	ptrs := [4]unsafe.Pointer{
+		unsafe.Pointer(unsafe.SliceData(in)),
+		unsafe.Pointer(unsafe.SliceData(luma)),
+		unsafe.Pointer(unsafe.SliceData(cb)),
+		unsafe.Pointer(unsafe.SliceData(cr)),
+	}
+
+	var pinner runtime.Pinner
+	for _, p := range ptrs {
+		pinner.Pin(p)
+	}
+	defer pinner.Unpin()
+
+	C.convert_frame_dci_xyz12_to_yuv16_double(
+		(*C.uint)(ptrs[0]),
+		(*C.ushort)(ptrs[1]),
+		(*C.ushort)(ptrs[2]),
+		(*C.ushort)(ptrs[3]),
+		C.int(width),
+		C.int(height),
+	)
+}
+
+// ConvertLineDCIXYZToYUV16Double calls the C routine
+// convert_line_dci_xyz12_to_yuv16_double on the given buffers.
+//
+// The backing arrays of in, luma, cb and cr are pinned for the duration of the
+// call and unpinned when the function returns. width and height are passed
+// through unchanged.
+func ConvertLineDCIXYZToYUV16Double(in []uint32, luma, cb, cr []uint16, width, height int) {
+	// Order: input, luma, cb, cr.
+	ptrs := [4]unsafe.Pointer{
+		unsafe.Pointer(unsafe.SliceData(in)),
+		unsafe.Pointer(unsafe.SliceData(luma)),
+		unsafe.Pointer(unsafe.SliceData(cb)),
+		unsafe.Pointer(unsafe.SliceData(cr)),
+	}
+
+	var pinner runtime.Pinner
+	for _, p := range ptrs {
+		pinner.Pin(p)
+	}
+	defer pinner.Unpin()
+
+	C.convert_line_dci_xyz12_to_yuv16_double(
+		(*C.uint)(ptrs[0]),
+		(*C.ushort)(ptrs[1]),
+		(*C.ushort)(ptrs[2]),
+		(*C.ushort)(ptrs[3]),
+		C.int(width),
+		C.int(height),
+	)
+}
+
+// ConvertFrameDCIXYZToYUV16Float calls the C routine
+// convert_frame_dci_xyz12_to_yuv16_float on the given buffers.
+//
+// The backing arrays of in, luma, cb and cr are pinned for the duration of the
+// call and unpinned when the function returns. width and height are passed
+// through unchanged.
+func ConvertFrameDCIXYZToYUV16Float(in []uint32, luma, cb, cr []uint16, width, height int) {
+	// Order: input, luma, cb, cr.
+	ptrs := [4]unsafe.Pointer{
+		unsafe.Pointer(unsafe.SliceData(in)),
+		unsafe.Pointer(unsafe.SliceData(luma)),
+		unsafe.Pointer(unsafe.SliceData(cb)),
+		unsafe.Pointer(unsafe.SliceData(cr)),
+	}
+
+	var pinner runtime.Pinner
+	for _, p := range ptrs {
+		pinner.Pin(p)
+	}
+	defer pinner.Unpin()
+
+	C.convert_frame_dci_xyz12_to_yuv16_float(
+		(*C.uint)(ptrs[0]),
+		(*C.ushort)(ptrs[1]),
+		(*C.ushort)(ptrs[2]),
+		(*C.ushort)(ptrs[3]),
+		C.int(width),
+		C.int(height),
+	)
+}
+
+// ConvertLineDCIXYZToYUV16Float calls the C routine
+// convert_line_dci_xyz12_to_yuv16_float on the given buffers.
+//
+// The backing arrays of in, luma, cb and cr are pinned for the duration of the
+// call and unpinned when the function returns. width and height are passed
+// through unchanged.
+func ConvertLineDCIXYZToYUV16Float(in []uint32, luma, cb, cr []uint16, width, height int) {
+	// Order: input, luma, cb, cr.
+	ptrs := [4]unsafe.Pointer{
+		unsafe.Pointer(unsafe.SliceData(in)),
+		unsafe.Pointer(unsafe.SliceData(luma)),
+		unsafe.Pointer(unsafe.SliceData(cb)),
+		unsafe.Pointer(unsafe.SliceData(cr)),
+	}
+
+	var pinner runtime.Pinner
+	for _, p := range ptrs {
+		pinner.Pin(p)
+	}
+	defer pinner.Unpin()
+
+	C.convert_line_dci_xyz12_to_yuv16_float(
+		(*C.uint)(ptrs[0]),
+		(*C.ushort)(ptrs[1]),
+		(*C.ushort)(ptrs[2]),
+		(*C.ushort)(ptrs[3]),
+		C.int(width),
+		C.int(height),
+	)
+}
--- a/conv/conv.h
+++ b/conv/conv.h
@ -0,0 +1,50 @@
+#ifndef XYZ2YUV_CONV_H
+#define XYZ2YUV_CONV_H
+
+#include <stdint.h>
+
+// Prepares the converter. xyz2rgb and rgb2yuv point to 3x3 matrices; the
+// pointers are kept by the converter, so the data must stay valid for as long
+// as conversions run. xyzGamma is used to build the XYZ lookup tables and
+// rgbGamma is stored as its reciprocal.
+void init(const double* restrict xyz2rgb, const double* restrict rgb2yuv, double xyzGamma, double rgbGamma);
+
+
+// Use available SIMD. Set to 0 to enforce the generic pipeline.
+#if !defined(USE_SIMD)
+#define USE_SIMD 1
+#endif
+
+// Opportunistically use AVX-512 features even in 256-bit mode.
+#if !defined(USE_OPPORTUNISTIC_AVX512)
+#define USE_OPPORTUNISTIC_AVX512 1
+#endif
+
+// AVX2 vector layouts (one letter per pixel, 0 is an unused lane):
+// double layout aaa0
+// float layout aaabbb00
+
+// Enable usage of the 512-bit wide pipeline, if supported.
+// Uses AVX-512 features. Requires AVX-512 F, VL, BW.
+// double layout aaabbb00
+// float layout aaabbbcccdddeee0
+#if !defined(USE_512_WIDE_PIPELINE)
+#define USE_512_WIDE_PIPELINE 1
+#endif
+
+// Sets the number of pumps per iteration of the pipeline. Supported: 1, 2, 4, 8.
+// Raise this if your architecture has more execution units than usual.
+// Recommended to stay at 2 or 4.
+#if !defined(SIMD_PUMPS)
+#define SIMD_PUMPS 1
+#endif
+
+// ExpandLoad and CompressStore are currently slower than set(vals...).
+// todo: inspect with CPUs newer than Zen 4.
+#if !defined(USE_AVX512_EXPANDLOAD)
+#define USE_AVX512_EXPANDLOAD 0
+#endif
+
+// Size of the lookup table, in bits. The only valid value is 12.
+#if !defined(XYZ_LOOKUP_TABLE_SIZE)
+#define XYZ_LOOKUP_TABLE_SIZE 12
+#endif
+
+// The line converters can run slightly out of bounds: about 24 bytes per pump
+// per line on the XYZ input and 8 bytes per pump per line on each YUV output.
+// Callers should allocate extra input/output buffer space for this reason.
+// The frame converters call the matching line converter once per row.
+void convert_line_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
+void convert_frame_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
+void convert_line_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
+void convert_frame_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
+
+// Returns a description of the pipeline selected at compile time.
+const char* decoder_information(void);
+
+#endif // XYZ2YUV_CONV_H
--- a/conv/conv_avx.h
+++ b/conv/conv_avx.h
@ -0,0 +1,271 @@
+#include <immintrin.h>
+#include <stdint.h>
+
+// x86_64 AVX2 and AVX512 definitions
+
+
+
+// Index helpers for the 16-bit permutes used by the store functions.
+// _perm_component_i(i, c, v): 16-bit element index of the low half of the
+// 32-bit lane holding component c of pixel i, with v lanes per pixel.
+// _perm_component2(c): component c of pixels 1 and 0 (4 lanes per pixel).
+// _perm_component5(c): component c of pixels 4 down to 0 (3 lanes per pixel).
+// The lists run from the highest pixel to the lowest to match the argument
+// order of the _set_epi16 intrinsics.
+#define _perm_component_i(i, c, v) ((i*v+c)*2)
+#define _perm_component2(c) _perm_component_i(1, c, 4), _perm_component_i(0, c, 4)
+#define _perm_component5(c) _perm_component_i(4, c, 3), _perm_component_i(3, c, 3), _perm_component_i(2, c, 3), _perm_component_i(1, c, 3), _perm_component_i(0, c, 3)
+
+#if USE_512_WIDE_PIPELINE && __AVX512F__ && __AVX512VL__ && __AVX512BW__ && __AVX512DQ__
+
+// Loads two pixels (six 32-bit samples starting at in + j*3) into a 256-bit
+// vector laid out as aaa0bbb0.
+// Default path: a masked load reads the six samples into lanes 0-5 (lanes 6-7
+// zeroed), then a permute moves the second pixel to lanes 4-6 and fills lanes
+// 3 and 7 from the zeroed lane 7.
+// Expand-load path: the mask places consecutive samples directly into lanes
+// 0-2 and 4-6.
+__m256i load_packed_512(int j, const uint32_t* restrict in) {
+#if USE_AVX512_EXPANDLOAD
+    //todo: this path is slower than _mm256_set_epi32, both mask and maskz ZEN4
+    return _mm256_maskz_expandloadu_epi32(_cvtu32_mask8(0b01110111), in+j*3);
+#else
+    return _mm256_permutexvar_epi32(_mm256_set_epi32(7, 5, 4, 3, 7, 2, 1, 0), _mm256_maskz_loadu_epi32(_cvtu32_mask8(0b00111111), in+j*3));
+#endif
+}
+
+// Stores two converted pixels (layout aaa0bbb0, one 32-bit lane per sample)
+// as 16-bit values into the three planes at index j and j+1.
+// The permute gathers the low 16 bits of each sample so that elements 0-1 hold
+// the first component of both pixels, 2-3 the second and 4-5 the third. Each
+// masked store writes only its own pair; the negative pointer offsets
+// compensate for the position of that pair inside the vector.
+void store_packed_512(__m256i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
+    #define _perm_idx512 _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component2(2), _perm_component2(1), _perm_component2(0))
+
+    __m128i store = _mm256_castsi256_si128(_mm256_permutexvar_epi16(_perm_idx512, packed));
+    _mm_mask_storeu_epi16(&luma[j], _cvtu32_mask8(0b00000011), store);
+    _mm_mask_storeu_epi16(&cb[j-2], _cvtu32_mask8(0b00001100), store);
+    _mm_mask_storeu_epi16(&cr[j-4], _cvtu32_mask8(0b00110000), store);
+}
+
+
+// Loads five pixels (fifteen 32-bit samples starting at in + j*3) into lanes
+// 0-14 of a 512-bit vector; the mask leaves lane 15 zeroed.
+__m512i load_packed_512f(int j, const uint32_t* restrict in) {
+    return _mm512_maskz_loadu_epi32(_cvtu32_mask16(0b0111111111111111), in+j*3);
+}
+
+// Stores five converted pixels (fifteen 32-bit lanes, three per pixel) as
+// 16-bit values into the three planes at index j..j+4.
+// The permute gathers the low 16 bits of each sample so that elements 0-4 hold
+// the first component of all five pixels, 5-9 the second and 10-14 the third.
+// Each masked store writes only its own group; the negative pointer offsets
+// compensate for the position of that group inside the vector.
+void store_packed_512f(__m512i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
+    #define _perm_idx512f _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component5(2), _perm_component5(1), _perm_component5(0))
+
+    __m256i store = _mm512_castsi512_si256(_mm512_permutexvar_epi16(_perm_idx512f, packed));
+    _mm256_mask_storeu_epi16(&luma[j ], _cvtu32_mask16(0b0000000000011111), store);
+    _mm256_mask_storeu_epi16(&cb[j -5], _cvtu32_mask16(0b0000001111100000), store);
+    _mm256_mask_storeu_epi16(&cr[j-10], _cvtu32_mask16(0b0111110000000000), store);
+}
+
+#define SSE 1
+#define AVX 1
+#define AVX512 1
+#include "simd_utils/simd_utils.h"
+
+// Raises every double lane of x to the scalar power y1, computed as
+// exp(y1 * log(x)).
+static inline __m512d pow512_pd1(__m512d x, double y1) {
+    const __m512d exponent = _mm512_set1_pd(y1);
+    const __m512d scaledLog = _mm512_mul_pd(exponent, log512_pd(x));
+    return exp512_pd(scaledLog);
+}
+
+// Raises every float lane of x to the scalar power y1.
+// Single precision causes issues here, so each 256-bit half is widened to
+// doubles, computed as exp(y1 * log(x)) and narrowed back to floats.
+static inline __m512 pow512_ps1(__m512 x, double y1) {
+    const __m512d exponent = _mm512_set1_pd(y1);
+
+    const __m512d lowWide = _mm512_cvtps_pd(_mm512_castps512_ps256(x));
+    const __m256 low = _mm512_cvtpd_ps(exp512_pd(_mm512_mul_pd(exponent, log512_pd(lowWide))));
+
+    const __m512d highWide = _mm512_cvtps_pd(_mm512_extractf32x8_ps(x, 1));
+    const __m256 high = _mm512_cvtpd_ps(exp512_pd(_mm512_mul_pd(exponent, log512_pd(highWide))));
+
+    return _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1);
+}
+
+#else
+
+// Loads the three 32-bit samples of pixel j (starting at in + j*3) into the
+// low lanes of a 128-bit vector.
+//
+// With AVX-512 F/VL/DQ available and USE_OPPORTUNISTIC_AVX512 set, a masked
+// load reads exactly three samples and zeroes lane 3. Otherwise a plain
+// unaligned 128-bit load is used, which also reads the sample that follows
+// the pixel into lane 3.
+__m128i load_packed_256(int j, const uint32_t* restrict in) {
+#if USE_OPPORTUNISTIC_AVX512 && __AVX512F__ && __AVX512VL__ && __AVX512DQ__
+    return _mm_maskz_loadu_epi32(_cvtu32_mask8(0b00000111), in+j*3);
+#else
+    // _mm_loadu_epi32 is an AVX-512VL intrinsic and must not be used on the
+    // path taken when AVX-512 is unavailable; the SSE2 unaligned load
+    // performs the same 16-byte read.
+    return _mm_loadu_si128((const __m128i*)(in+j*3));
+#endif
+}
+
+// Stores one converted pixel: 32-bit lanes 0, 1 and 2 of packed are written,
+// truncated to 16 bits, to luma[j], cb[j] and cr[j].
+void store_packed_256(__m128i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
+    const int32_t first = _mm_extract_epi32(packed, 0);
+    const int32_t second = _mm_extract_epi32(packed, 1);
+    const int32_t third = _mm_extract_epi32(packed, 2);
+
+    luma[j] = first;
+    cb[j] = second;
+    cr[j] = third;
+}
+
+/*#if USE_DOT_PRODUCT
+//position entries properly in vectors ready for dot product
+m[0] = _mm256_set_ps(0, in[2], in[1], in[0], 0, in[2], in[1], in[0]); //0rrr 0rrr
+m[1] = _mm256_set_ps(0, in[5], in[4], in[3], 0, in[5], in[4], in[3]); //0ggg 0ggg
+m[2] = _mm256_set_ps(0, in[8], in[7], in[6], 0, in[8], in[7], in[6]); //0bbb 0bbb
+
+    __m256 a = _mm256_dp_ps(v, m0, 0b01110001);
+    __m256 b = _mm256_dp_ps(v, m1, 0b01110010);
+    __m256 c = _mm256_dp_ps(v, m2, 0b01110100);
+    return _mm256_blend_ps(_mm256_blend_ps(a, b, 0b00100010), c, 0b01000100);
+*/
+
+// Loads two pixels (six 32-bit samples starting at in + j*3) into a 256-bit
+// vector laid out as aaa0bbb0. j is a pixel index on every path.
+//
+// Expand-load path: the mask places consecutive samples into lanes 0-2 and
+// 4-6. Masked-load path: six samples are read into lanes 0-5 and a permute
+// moves the second pixel to lanes 4-6. Fallback path: the vector is assembled
+// from the six samples with a plain set.
+__m256i load_packed_256f(int j, const uint32_t* restrict in) {
+#if USE_OPPORTUNISTIC_AVX512 && USE_AVX512_EXPANDLOAD && __AVX512VBMI2__ && __AVX512VL__ && __AVX512F__
+    //todo: slow ZEN4
+    return _mm256_maskz_expandloadu_epi32(_cvtu32_mask8(0b01110111), in+j*3);
+#else
+    #if USE_OPPORTUNISTIC_AVX512 && __AVX512F__ && __AVX512BW__ && __AVX512VL__ && __AVX512DQ__
+    return _mm256_permutexvar_epi32(_mm256_set_epi32(7, 5, 4, 3, 7, 2, 1, 0), _mm256_maskz_loadu_epi32(_cvtu32_mask8(0b00111111), in+j*3));
+    #else
+    // Index from the start of pixel j, like the other paths. Reading in[j..j+5]
+    // would only be correct for j == 0.
+    const uint32_t* px = in + j*3;
+    return _mm256_set_epi32(0, px[5], px[4], px[3], 0, px[2], px[1], px[0]);
+    #endif
+#endif
+}
+
+// Stores two converted pixels (layout aaa0bbb0, one 32-bit lane per sample)
+// as 16-bit values into the three planes at index j and j+1. j is a pixel
+// index on both paths.
+//
+// AVX-512 path: a 16-bit permute gathers the low half of each sample so that
+// elements 0-1 hold the first component of both pixels, 2-3 the second and 4-5
+// the third; each masked store writes its own pair, with a negative pointer
+// offset compensating for the pair's position inside the vector.
+// Fallback path: the six lanes are written one by one.
+void store_packed_256f(__m256i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
+#if USE_OPPORTUNISTIC_AVX512 && __AVX512BW__ && __AVX512VL__ && __AVX512DQ__
+
+#define _perm_idx256 _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component2(2), _perm_component2(1), _perm_component2(0))
+
+    __m128i store = _mm256_castsi256_si128(_mm256_permutexvar_epi16(_perm_idx256, packed));
+    _mm_mask_storeu_epi16(&luma[j], _cvtu32_mask8(0b00000011), store);
+    _mm_mask_storeu_epi16(&cb[j-2], _cvtu32_mask8(0b00001100), store);
+    _mm_mask_storeu_epi16(&cr[j-4], _cvtu32_mask8(0b00110000), store);
+
+#else
+    const int32_t* lanes = (const int32_t*)(&packed);
+
+    // Write to j and j+1 like the masked-store path. Indexing with j/3 put the
+    // results at the wrong position for every j other than 0.
+    luma[j] = lanes[0];
+    cb[j] = lanes[1];
+    cr[j] = lanes[2];
+
+    luma[j+1] = lanes[4];
+    cb[j+1] = lanes[5];
+    cr[j+1] = lanes[6];
+
+#endif
+}
+
+#define SSE 1
+#define AVX 1
+#include "simd_utils/simd_utils.h"
+
+// Raises every double lane of x to the scalar power y1, computed as
+// exp(y1 * log(x)).
+static inline __m256d pow256_pd1(__m256d x, double y1) {
+    const __m256d exponent = _mm256_set1_pd(y1);
+    const __m256d scaledLog = _mm256_mul_pd(exponent, log256_pd(x));
+    return exp256_pd(scaledLog);
+}
+
+// Raises every float lane of x to the scalar power y1.
+// Single precision causes issues here, so each 128-bit half is widened to
+// doubles, computed as exp(y1 * log(x)) and narrowed back to floats.
+static inline __m256 pow256_ps1(__m256 x, double y1) {
+    const __m256d exponent = _mm256_set1_pd(y1);
+
+    const __m256d lowWide = _mm256_cvtps_pd(_mm256_castps256_ps128(x));
+    const __m128 low = _mm256_cvtpd_ps(exp256_pd(_mm256_mul_pd(exponent, log256_pd(lowWide))));
+
+    const __m256d highWide = _mm256_cvtps_pd(_mm256_extractf128_ps(x, 1));
+    const __m128 high = _mm256_cvtpd_ps(exp256_pd(_mm256_mul_pd(exponent, log256_pd(highWide))));
+
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1);
+}
+
+#endif
+
+
+
+
+// Lookup tables keyed by <floatType>_<pixels per vector>.
+// typed_vector_size_*: width in bits of the floating point vector.
+#define typed_vector_size_x86_64_float_5 512
+#define typed_vector_size_x86_64_double_2 512
+#define typed_vector_size_x86_64_float_2 256
+#define typed_vector_size_x86_64_double_1 256
+
+// typed_permute_lanes_*: middle part of the permute intrinsic name used for
+// that configuration.
+#define typed_permute_lanes_x86_64_float_5 _permutexvar_
+#define typed_permute_lanes_x86_64_double_2 _permutex_
+#define typed_permute_lanes_x86_64_float_2 _permute_
+#define typed_permute_lanes_x86_64_double_1 _permute4x64_
+
+
+// Wrapper around _mm512_i32gather_ps that takes its arguments in the order
+// (base_addr, vindex, scale) instead of (vindex, base_addr, scale).
+// The switch passes scale to the intrinsic as a literal in every case; only
+// 1, 2, 4 and 8 are handled, any other value is declared unreachable.
+inline __attribute__((always_inline)) __m512 _int_i32gather_ps(void const* base_addr, __m512i vindex, int scale) {
+    switch(scale){
+        case 1:
+        return _mm512_i32gather_ps(vindex, base_addr, 1);
+        case 2:
+        return _mm512_i32gather_ps(vindex, base_addr, 2);
+        case 4:
+        return _mm512_i32gather_ps(vindex, base_addr, 4);
+        case 8:
+        return _mm512_i32gather_ps(vindex, base_addr, 8);
+        default:
+        __builtin_unreachable();
+    }
+}
+
+// Wrapper around _mm512_i32gather_pd that takes its arguments in the order
+// (base_addr, vindex, scale) instead of (vindex, base_addr, scale).
+// The switch passes scale to the intrinsic as a literal in every case; only
+// 1, 2, 4 and 8 are handled, any other value is declared unreachable.
+inline __attribute__((always_inline)) __m512d _int_i32gather_pd(void const* base_addr, __m256i vindex, int scale) {
+    switch(scale){
+        case 1:
+        return _mm512_i32gather_pd(vindex, base_addr, 1);
+        case 2:
+        return _mm512_i32gather_pd(vindex, base_addr, 2);
+        case 4:
+        return _mm512_i32gather_pd(vindex, base_addr, 4);
+        case 8:
+        return _mm512_i32gather_pd(vindex, base_addr, 8);
+        default:
+        __builtin_unreachable();
+    }
+}
+
+#define typed_pow1_x86_64_float_5 pow512_ps1
+#define typed_pow1_x86_64_double_2 pow512_pd1
+#define typed_pow1_x86_64_float_2 pow256_ps1
+#define typed_pow1_x86_64_double_1 pow256_pd1
+#define typed_pow1_x86_64(floatType, elementCount) typed_pow1_x86_64_##floatType##_##elementCount
+
+#define typed_i32gather_x86_64_float_5 _int_i32gather_ps
+#define typed_i32gather_x86_64_double_2 _int_i32gather_pd
+#define typed_i32gather_x86_64_float_2 _mm256_i32gather_ps
+#define typed_i32gather_x86_64_double_1 _mm256_i32gather_pd
+#define typed_i32gather_x86_64(floatType, elementCount) typed_i32gather_x86_64_##floatType##_##elementCount
+
+#define typed_vector_func_prefix_x86_64 _mm
+#define typed_vector_type_prefix_x86_64 __m
+#define typed_vector_type_suffix_x86_64_double d
+#define typed_vector_type_suffix_x86_64_int i
+
+#define typed_vector_int_size_x86_64_float_5 512
+#define typed_vector_int_func_x86_64_float_5 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_float_5)
+#define typed_vector_int_size_x86_64_double_2 256
+#define typed_vector_int_func_x86_64_double_2 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_double_2)
+#define typed_vector_int_size_x86_64_float_2 256
+#define typed_vector_int_func_x86_64_float_2 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_float_2)
+#define typed_vector_int_size_x86_64_double_1 128
+#define typed_vector_int_func_x86_64_double_1 typed_vector_func_prefix_x86_64
+
+#define typed_vector_func_type_x86_64_float ps
+#define typed_vector_func_type_x86_64_double pd
+
+#define typed_vector_type_x86_64_float(elementCount) concat(typed_vector_type_prefix_x86_64, typed_vector_size_x86_64_float_##elementCount)
+#define typed_vector_type_x86_64_double(elementCount) concat3(typed_vector_type_prefix_x86_64, typed_vector_size_x86_64_double_##elementCount, typed_vector_type_suffix_x86_64_double)
+
+#define typed_vector_type_x86_64(floatType, elementCount) typed_vector_type_x86_64_##floatType(elementCount)
+
+#define typed_vector_int_type_x86_64(floatType, elementCount) concat3(typed_vector_type_prefix_x86_64, typed_vector_int_size_x86_64_##floatType##_##elementCount, typed_vector_type_suffix_x86_64_int)
+
+#define typed_vector_func_x86_64_float(elementCount) concat(typed_vector_func_prefix_x86_64, typed_vector_size_x86_64_float_##elementCount)
+#define typed_vector_func_x86_64_double(elementCount) concat(typed_vector_func_prefix_x86_64, typed_vector_size_x86_64_double_##elementCount)
+
+#define typed_vector_func_x86_64(floatType, elementCount) typed_vector_func_x86_64_##floatType(elementCount)
+
+
+
+#define typed_permute_lanes_x86_64_float_5 _permutexvar_
+#define typed_permute_lanes_x86_64_double_2 _permutex_
+#define typed_permute_lanes_x86_64_float_2 _permute_
+#define typed_permute_lanes_x86_64_double_1 _permute4x64_
+
+#define typed_fmadd_x86_64 _fmadd_
+#define typed_add_x86_64 _add_
+#define typed_mul_x86_64 _mul_
+#define typed_set1_x86_64 _set1_
+#define typed_set_x86_64 _set_
+#define typed_min_x86_64 _min_
+#define typed_max_x86_64 _max_
+#define typed_seti_x86_64 _set_epi32
+#define typed_addi_x86_64 _add_epi32
+#define typed_cvt_x86_64_float _cvtps_epi32
+#define typed_cvt_x86_64_double _cvtpd_epi32
+
+#define typed_func_x86_64_fmadd(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_fmadd_x86_64, typed_vector_func_type_x86_64_##floatType)
+#define typed_func_x86_64_add(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_add_x86_64, typed_vector_func_type_x86_64_##floatType)
+#define typed_func_x86_64_mul(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_mul_x86_64, typed_vector_func_type_x86_64_##floatType)
+#define typed_func_x86_64_set1(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_set1_x86_64, typed_vector_func_type_x86_64_##floatType)
+#define typed_func_x86_64_set(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_set_x86_64, typed_vector_func_type_x86_64_##floatType)
+#define typed_func_x86_64_min(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_min_x86_64, typed_vector_func_type_x86_64_##floatType)
+#define typed_func_x86_64_max(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_max_x86_64, typed_vector_func_type_x86_64_##floatType)
+#define typed_func_x86_64_ftoi(floatType, elementCount) concat(typed_vector_func_x86_64(floatType, elementCount), typed_cvt_x86_64_##floatType)
+#define typed_func_x86_64_seti(floatType, elementCount) concat(typed_vector_int_func_x86_64_##floatType##_##elementCount, typed_seti_x86_64)
+#define typed_func_x86_64_addi(floatType, elementCount) concat(typed_vector_int_func_x86_64_##floatType##_##elementCount, typed_addi_x86_64)
+
+#define typed_func_x86_64_pow1(floatType, elementCount) typed_pow1_x86_64(floatType, elementCount)
+#define typed_func_x86_64_i32gather(floatType, elementCount) typed_i32gather_x86_64(floatType, elementCount)
+#define typed_func_x86_64_permute_lanes(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_permute_lanes_x86_64_##floatType##_##elementCount, typed_vector_func_type_x86_64_##floatType)
+
+#define typed_permute_lanes_x86_64_float_5 _permutexvar_
+#define typed_permute_lanes_x86_64_double_2 _permutex_
+#define typed_permute_lanes_x86_64_float_2 _permute_
+#define typed_permute_lanes_x86_64_double_1 _permute4x64_
--- a/conv/conv_gen.h
+++ b/conv/conv_gen.h
@ -0,0 +1,420 @@
+
+
+
+// Preprocessor helpers. Each public macro forwards to an underscore-prefixed
+// worker so that its arguments are macro-expanded before being stringified (#)
+// or pasted (##).
+// lit: expands to its (expanded) argument unchanged.
+#define _lit(s) s
+#define lit(s) _lit(s)
+// str: turns the expanded argument into a string literal.
+#define _str(s) #s
+#define str(s) _str(s)
+// concat: pastes two expanded arguments into one token.
+#define _concat(a,b) a##b
+#define concat(a,b) _concat(a,b)
+// concat3: pastes three expanded arguments into one token.
+#define _concat3(a,b,c) a##b##c
+#define concat3(a,b,c) _concat3(a,b,c)
+
+
+// Lookup-table gathers: call the supplied i32gather function with the per-type
+// table, the packed index vector and a scale of 8 (double table) or 4 (float table).
+// Only a 12-bit lookup table is supported; any other size stops the build.
+#if XYZ_LOOKUP_TABLE_SIZE==12
+#define typed_gather_double(i32gather, packed) i32gather(xyz12_to_linear_double, packed, 8)
+#define typed_gather_float(i32gather, packed) i32gather(xyz12_to_linear_float, packed, 4)
+#else
+#error "Not supported"
+#endif
+
+// Index triples for pixel slot l (three consecutive elements 3*l .. 3*l+2).
+// step1 lists them as 3*l, 3*l+2, 3*l+1 and step2 as 3*l+1, 3*l, 3*l+2.
+// The parameter is not parenthesized; it is only used with integer literals here.
+#define _shuf_lane_step1(l) 3*l, 3*l+2, 3*l+1
+#define _shuf_lane_step2(l) 3*l+1, 3*l, 3*l+2
+// Builds a 16-entry index vector through seti: 0xf first, then the chosen step
+// triple for slots 4, 3, 2, 1, 0.
+// NOTE(review): assumes seti takes its arguments from the highest element down to
+// the lowest - confirm against the seti mapping.
+#define _shuf_idx(seti, step) seti(0xf, _shuf_lane_step##step(4), _shuf_lane_step##step(3), _shuf_lane_step##step(2), _shuf_lane_step##step(1), _shuf_lane_step##step(0))
+
+// xxxxxyyyyyzzzzz0
+
+// Matrix-vector product on packed pixels, written as three element-wise products
+// of v (as is, and permuted twice) with the pre-arranged matrix vectors m0, m1, m2.
+// 5-pixel FMA variant: permutations use a variable index vector from _shuf_idx.
+// fVectorType and add are accepted for signature parity but not used here.
+// NOTE(review): assumes fmadd(a, b, c) computes a*b + c - confirm against the fmadd mapping.
+#define typed_mul_vec_dot_5_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
+    fmadd( \
+        permuteLanesVar(_shuf_idx(seti, 2), v), \
+        m2, \
+        fmadd( \
+           permuteLanesVar(_shuf_idx(seti, 1), v), \
+           m1, \
+           mul(v, m0) \
+        ) \
+    )
+
+// 2-pixel FMA variant: permutations use the immediates 0b11001001 and 0b11010010.
+// fVectorType, seti and add are not used here.
+#define typed_mul_vec_dot_2_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
+    fmadd( \
+        permuteLanes(v, 0b11010010), \
+        m2, \
+        fmadd( \
+            permuteLanes(v, 0b11001001), \
+            m1, \
+            mul(v, m0) \
+        ) \
+    )
+
+// 2-pixel variant without FMA: same three products, combined with mul and add.
+// fVectorType, seti and fmadd are not used here.
+#define typed_mul_vec_dot_2_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
+    add( \
+        add( \
+            mul(v, m0), \
+            mul(permuteLanes(v, 0b11001001), m1) \
+        ), \
+        mul(permuteLanes(v, 0b11010010), m2) \
+    )
+
+// The 1-pixel variants reuse the 2-pixel implementations unchanged.
+#define typed_mul_vec_dot_1_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
+typed_mul_vec_dot_2_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v)
+
+#define typed_mul_vec_dot_1_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
+typed_mul_vec_dot_2_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v)
+
+
+// Dispatchers: paste the fma flag (0 or 1) onto the variant name for the given
+// pixel count.
+#define typed_mul_vec_dot_1(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
+typed_mul_vec_dot_1_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
+
+#define typed_mul_vec_dot_2(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
+typed_mul_vec_dot_2_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
+
+// NOTE(review): no typed_mul_vec_dot_5_fma_0 is defined in this part of the file,
+// so the 5-pixel path appears to require fma == 1 - confirm.
+#define typed_mul_vec_dot_5(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
+typed_mul_vec_dot_5_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
+
+// Integer offset vectors added to the converted values: per pixel triple, two
+// entries hold int16MaxPlusOne and one holds 0; the remaining padding entries are 0.
+// NOTE(review): presumably the zero entry lines up with luma and the offset entries
+// with cb/cr (as in the scalar converter) - this depends on the argument order of seti.
+// 5 pixels, 16 entries.
+#define typed_yuv_add_5(seti) \
+seti(0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0)
+// 2 pixels, 8 entries.
+#define typed_yuv_add_2(seti) \
+seti(0, int16MaxPlusOne, int16MaxPlusOne, 0, 0, int16MaxPlusOne, int16MaxPlusOne, 0)
+// 1 pixel, 4 entries.
+#define typed_yuv_add_1(seti) \
+seti(0, int16MaxPlusOne, int16MaxPlusOne, 0)
+
+
+// Spread the nine coefficients in[0..8] over three vectors m[0..2], repeating the
+// same triple once per pixel slot:
+//   m[0] <- in[8], in[4], in[0]
+//   m[1] <- in[6], in[5], in[1]
+//   m[2] <- in[7], in[3], in[2]
+// Padding entries are set to 0. Each macro expands to three complete statements.
+// 5 pixel slots, 16 entries per vector.
+#define typed_mat_load_func_5(setf, m, in) \
+m[0] = setf(0, in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0]); /*0 bgr bgr bgr bgr bgr*/ \
+m[1] = setf(0, in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1]); /*0 rbg rbg rbg rbg rbg*/ \
+m[2] = setf(0, in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2]); /*0 grb grb grb grb grb*/
+
+// 2 pixel slots, 8 entries per vector.
+#define typed_mat_load_func_2(setf, m, in) \
+m[0] = setf(0, in[8], in[4], in[0], 0, in[8], in[4], in[0]); /* 0bgr 0bgr*/ \
+m[1] = setf(0, in[6], in[5], in[1], 0, in[6], in[5], in[1]); /* 0rbg 0rbg*/ \
+m[2] = setf(0, in[7], in[3], in[2], 0, in[7], in[3], in[2]); /* 0grb 0grb*/
+
+// 1 pixel slot, 4 entries per vector.
+#define typed_mat_load_func_1(setf, m, in) \
+m[0] = setf(0, in[8], in[4], in[0]); /* 0bgr*/ \
+m[1] = setf(0, in[6], in[5], in[1]); /* 0rbg*/ \
+m[2] = setf(0, in[7], in[3], in[2]); /* 0grb*/
+
+// todo: optimize pow via https://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent ?
+
+// Name prefixes of the generated functions; the pump macros below paste the float
+// type onto them with concat(), e.g. concat(_line_func, floatType).
+#define _load_matrix_func _load_matrix_
+#define _line_func convert_line_dci_xyz12_to_yuv16_
+//concat(_line_func, floatType)
+
+// typed_converter_pump1: generates the converter for one vector per loop iteration.
+// The expansion defines, for the given floatType:
+//   - xyz2rgb_mat_<floatType>[3] and rgb2yuv_mat_<floatType>[3] (matrix vectors),
+//   - xyz12_to_linear_<floatType>, a table with 1 << XYZ_LOOKUP_TABLE_SIZE entries,
+//   - _load_matrix_<floatType>(m, in), which fills m[0..2] from in[0..8],
+//   - convert_line_dci_xyz12_to_yuv16_<floatType>(in, luma, cb, cr, width, height).
+// Per iteration the line function loads packed indices, gathers table values,
+// multiplies by the xyz2rgb vectors, clamps, applies fpow with rgbGamma_g,
+// multiplies by the rgb2yuv vectors, converts to integers, adds the chroma offset
+// and stores the result.
+// The clamp is max(min(rgb, 1.), 0.): despite their names, minValue holds the upper
+// bound 1. and maxValue holds the lower bound 0.
+// height is not used by the generated function.
+// NOTE(review): j advances by elementCount with no tail handling, so width
+// presumably has to be a multiple of the step (or the buffers padded) - confirm
+// against packedLoadFunc / packedStoreFunc.
+#define typed_converter_pump1(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
+fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
+fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
+floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
+\
+void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
+    typed_mat_load_func_##elementCount(setf, m, in) \
+}; \
+\
+\
+inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
+    iPackedType packed; \
+    fVectorType xyz; \
+    fVectorType rgb; \
+    fVectorType yuv; \
+    \
+    const fVectorType minValue = set1f(1.); \
+    const fVectorType maxValue = set1f(0.); \
+    const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
+    \
+    const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
+    const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
+    const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
+    \
+    const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
+    const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
+    const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
+    \
+    for (int j = 0; j < width; j += elementCount) { \
+        packed = packedLoadFunc(j, in); \
+        \
+        xyz = typed_gather_##floatType(i32gather, packed); \
+        \
+        rgb = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz); \
+        \
+        rgb = maxf(minf(rgb, minValue), maxValue); \
+        \
+        rgb = fpow(rgb, rgbGamma_g); \
+        \
+        yuv = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb); \
+        \
+        packed = addi(ftoi(yuv), yuv2packed_add); \
+        \
+        packedStoreFunc(packed, j, luma, cb, cr); \
+    } \
+};
+
+
+
+// typed_converter_pump2: same generated globals and functions as the single-vector
+// pump, but the line loop is unrolled to process two vectors per iteration
+// (offsets j and j+elementCount), stepping j by elementCount*2.
+// The clamp is max(min(rgb, 1.), 0.): minValue holds the upper bound 1. and
+// maxValue holds the lower bound 0. height is not used.
+// NOTE(review): no tail handling is visible, so width presumably has to be a
+// multiple of elementCount*2 (or the buffers padded) - confirm against
+// packedLoadFunc / packedStoreFunc.
+#define typed_converter_pump2(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
+fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
+fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
+floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
+\
+void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
+    typed_mat_load_func_##elementCount(setf, m, in) \
+}; \
+\
+\
+inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
+    iPackedType packed0, packed1; \
+    fVectorType xyz0, xyz1; \
+    fVectorType rgb0, rgb1; \
+    fVectorType yuv0, yuv1; \
+    \
+    const fVectorType minValue = set1f(1.); \
+    const fVectorType maxValue = set1f(0.); \
+    const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
+    \
+    const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
+    const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
+    const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
+    \
+    const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
+    const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
+    const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
+    \
+    for (int j = 0; j < width; j += elementCount*2) { \
+        packed0 = packedLoadFunc(j, in); \
+        packed1 = packedLoadFunc(j+elementCount, in); \
+        \
+        xyz0 = typed_gather_##floatType(i32gather, packed0); \
+        xyz1 = typed_gather_##floatType(i32gather, packed1); \
+        \
+        rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
+        rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
+        \
+        rgb0 = maxf(minf(rgb0, minValue), maxValue); \
+        rgb1 = maxf(minf(rgb1, minValue), maxValue); \
+        \
+        rgb0 = fpow(rgb0, rgbGamma_g); \
+        rgb1 = fpow(rgb1, rgbGamma_g); \
+        \
+        yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
+        yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
+        \
+        packed0 = addi(ftoi(yuv0), yuv2packed_add); \
+        packed1 = addi(ftoi(yuv1), yuv2packed_add); \
+        \
+        packedStoreFunc(packed0, j, luma, cb, cr); \
+        packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
+    } \
+};
+
+
+// typed_converter_pump4: same generated globals and functions as the single-vector
+// pump, but the line loop is unrolled to process four vectors per iteration
+// (offsets j, j+elementCount, j+elementCount*2, j+elementCount*3), stepping j by
+// elementCount*4.
+// The clamp is max(min(rgb, 1.), 0.): minValue holds the upper bound 1. and
+// maxValue holds the lower bound 0. height is not used.
+// NOTE(review): no tail handling is visible, so width presumably has to be a
+// multiple of elementCount*4 (or the buffers padded) - confirm against
+// packedLoadFunc / packedStoreFunc.
+#define typed_converter_pump4(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
+fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
+fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
+floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
+\
+void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
+    typed_mat_load_func_##elementCount(setf, m, in) \
+}; \
+\
+\
+inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
+    iPackedType packed0, packed1, packed2, packed3; \
+    fVectorType xyz0, xyz1, xyz2, xyz3; \
+    fVectorType rgb0, rgb1, rgb2, rgb3; \
+    fVectorType yuv0, yuv1, yuv2, yuv3; \
+    \
+    const fVectorType minValue = set1f(1.); \
+    const fVectorType maxValue = set1f(0.); \
+    const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
+    \
+    const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
+    const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
+    const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
+    \
+    const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
+    const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
+    const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
+    \
+    for (int j = 0; j < width; j += elementCount*4) { \
+        packed0 = packedLoadFunc(j, in); \
+        packed1 = packedLoadFunc(j+elementCount, in); \
+        packed2 = packedLoadFunc(j+elementCount*2, in); \
+        packed3 = packedLoadFunc(j+elementCount*3, in); \
+        \
+        xyz0 = typed_gather_##floatType(i32gather, packed0); \
+        xyz1 = typed_gather_##floatType(i32gather, packed1); \
+        xyz2 = typed_gather_##floatType(i32gather, packed2); \
+        xyz3 = typed_gather_##floatType(i32gather, packed3); \
+        \
+        rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
+        rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
+        rgb2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz2); \
+        rgb3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz3); \
+        \
+        rgb0 = maxf(minf(rgb0, minValue), maxValue); \
+        rgb1 = maxf(minf(rgb1, minValue), maxValue); \
+        rgb2 = maxf(minf(rgb2, minValue), maxValue); \
+        rgb3 = maxf(minf(rgb3, minValue), maxValue); \
+        \
+        rgb0 = fpow(rgb0, rgbGamma_g); \
+        rgb1 = fpow(rgb1, rgbGamma_g); \
+        rgb2 = fpow(rgb2, rgbGamma_g); \
+        rgb3 = fpow(rgb3, rgbGamma_g); \
+        \
+        yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
+        yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
+        yuv2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb2); \
+        yuv3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb3); \
+        \
+        packed0 = addi(ftoi(yuv0), yuv2packed_add); \
+        packed1 = addi(ftoi(yuv1), yuv2packed_add); \
+        packed2 = addi(ftoi(yuv2), yuv2packed_add); \
+        packed3 = addi(ftoi(yuv3), yuv2packed_add); \
+        \
+        packedStoreFunc(packed0, j, luma, cb, cr); \
+        packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
+        packedStoreFunc(packed2, j+elementCount*2, luma, cb, cr); \
+        packedStoreFunc(packed3, j+elementCount*3, luma, cb, cr); \
+    } \
+};
+
+
+
+// typed_converter_pump8: same generated globals and functions as the single-vector
+// pump, but the line loop is unrolled to process eight vectors per iteration
+// (offsets j .. j+elementCount*7), stepping j by elementCount*8.
+// The clamp is max(min(rgb, 1.), 0.): minValue holds the upper bound 1. and
+// maxValue holds the lower bound 0. height is not used.
+// NOTE(review): no tail handling is visible, so width presumably has to be a
+// multiple of elementCount*8 (or the buffers padded) - confirm against
+// packedLoadFunc / packedStoreFunc.
+#define typed_converter_pump8(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
+fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
+fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
+floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
+\
+void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
+    typed_mat_load_func_##elementCount(setf, m, in) \
+}; \
+\
+\
+inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
+    iPackedType packed0, packed1, packed2, packed3, packed4, packed5, packed6, packed7; \
+    fVectorType xyz0, xyz1, xyz2, xyz3, xyz4, xyz5, xyz6, xyz7; \
+    fVectorType rgb0, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7; \
+    fVectorType yuv0, yuv1, yuv2, yuv3, yuv4, yuv5, yuv6, yuv7; \
+    \
+    const fVectorType minValue = set1f(1.); \
+    const fVectorType maxValue = set1f(0.); \
+    const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
+    \
+    const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
+    const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
+    const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
+    \
+    const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
+    const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
+    const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
+    \
+    for (int j = 0; j < width; j += elementCount*8) { \
+        packed0 = packedLoadFunc(j, in); \
+        packed1 = packedLoadFunc(j+elementCount, in); \
+        packed2 = packedLoadFunc(j+elementCount*2, in); \
+        packed3 = packedLoadFunc(j+elementCount*3, in); \
+        packed4 = packedLoadFunc(j+elementCount*4, in); \
+        packed5 = packedLoadFunc(j+elementCount*5, in); \
+        packed6 = packedLoadFunc(j+elementCount*6, in); \
+        packed7 = packedLoadFunc(j+elementCount*7, in); \
+        \
+        xyz0 = typed_gather_##floatType(i32gather, packed0); \
+        xyz1 = typed_gather_##floatType(i32gather, packed1); \
+        xyz2 = typed_gather_##floatType(i32gather, packed2); \
+        xyz3 = typed_gather_##floatType(i32gather, packed3); \
+        xyz4 = typed_gather_##floatType(i32gather, packed4); \
+        xyz5 = typed_gather_##floatType(i32gather, packed5); \
+        xyz6 = typed_gather_##floatType(i32gather, packed6); \
+        xyz7 = typed_gather_##floatType(i32gather, packed7); \
+        \
+        rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
+        rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
+        rgb2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz2); \
+        rgb3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz3); \
+        rgb4 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz4); \
+        rgb5 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz5); \
+        rgb6 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz6); \
+        rgb7 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz7); \
+        \
+        rgb0 = maxf(minf(rgb0, minValue), maxValue); \
+        rgb1 = maxf(minf(rgb1, minValue), maxValue); \
+        rgb2 = maxf(minf(rgb2, minValue), maxValue); \
+        rgb3 = maxf(minf(rgb3, minValue), maxValue); \
+        rgb4 = maxf(minf(rgb4, minValue), maxValue); \
+        rgb5 = maxf(minf(rgb5, minValue), maxValue); \
+        rgb6 = maxf(minf(rgb6, minValue), maxValue); \
+        rgb7 = maxf(minf(rgb7, minValue), maxValue); \
+        \
+        rgb0 = fpow(rgb0, rgbGamma_g); \
+        rgb1 = fpow(rgb1, rgbGamma_g); \
+        rgb2 = fpow(rgb2, rgbGamma_g); \
+        rgb3 = fpow(rgb3, rgbGamma_g); \
+        rgb4 = fpow(rgb4, rgbGamma_g); \
+        rgb5 = fpow(rgb5, rgbGamma_g); \
+        rgb6 = fpow(rgb6, rgbGamma_g); \
+        rgb7 = fpow(rgb7, rgbGamma_g); \
+        \
+        yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
+        yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
+        yuv2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb2); \
+        yuv3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb3); \
+        yuv4 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb4); \
+        yuv5 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb5); \
+        yuv6 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb6); \
+        yuv7 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb7); \
+        \
+        packed0 = addi(ftoi(yuv0), yuv2packed_add); \
+        packed1 = addi(ftoi(yuv1), yuv2packed_add); \
+        packed2 = addi(ftoi(yuv2), yuv2packed_add); \
+        packed3 = addi(ftoi(yuv3), yuv2packed_add); \
+        packed4 = addi(ftoi(yuv4), yuv2packed_add); \
+        packed5 = addi(ftoi(yuv5), yuv2packed_add); \
+        packed6 = addi(ftoi(yuv6), yuv2packed_add); \
+        packed7 = addi(ftoi(yuv7), yuv2packed_add); \
+        \
+        packedStoreFunc(packed0, j, luma, cb, cr); \
+        packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
+        packedStoreFunc(packed2, j+elementCount*2, luma, cb, cr); \
+        packedStoreFunc(packed3, j+elementCount*3, luma, cb, cr); \
+        packedStoreFunc(packed4, j+elementCount*4, luma, cb, cr); \
+        packedStoreFunc(packed5, j+elementCount*5, luma, cb, cr); \
+        packedStoreFunc(packed6, j+elementCount*6, luma, cb, cr); \
+        packedStoreFunc(packed7, j+elementCount*7, luma, cb, cr); \
+    } \
+};
+
+// Platform dispatch: typed_func(p, t, n, op) expands to typed_func_<p>_<op>(t, n),
+// which each platform header provides.
+#define typed_func(platform, floatType, elementCount, func) typed_func_##platform##_##func(floatType, elementCount)
+
+// Float and integer vector type names for a platform / float type / pixel count.
+#define typed_vector_type(platform, floatType, elementCount) typed_vector_type_##platform(floatType, elementCount)
+#define typed_vector_int_type(platform, floatType, elementCount) typed_vector_int_type_##platform(floatType, elementCount)
+
+// Pastes the name typed_vector_int_size_<platform>_<floatType>_<elementCount>.
+#define typed_vector_int_size(platform, floatType, elementCount) typed_vector_int_size_##platform##_##floatType##_##elementCount
+
+// NOTE(review): exact duplicate of the definition above; an identical redefinition
+// is harmless but redundant.
+#define typed_vector_int_size(platform, floatType, elementCount) typed_vector_int_size_##platform##_##floatType##_##elementCount
+
+// NOTE(review): this pastes the prefix "typed_vector_int_funcsize_", while the
+// x86_64 header refers to names starting with "typed_vector_int_func_x86_64_";
+// the "size" part looks like a copy/paste leftover - confirm before relying on it.
+#define typed_vector_int_func(platform, floatType, elementCount) typed_vector_int_funcsize_##platform##_##floatType##_##elementCount
+
+// Selects the pump macro for the requested unroll factor by pasting it onto
+// typed_converter_pump (pump macros for 1, 2, 4 and 8 are defined in this header).
+#define typed_converter_pumps(pumps) concat(typed_converter_pump, pumps)
+
+// Instantiates a converter for one platform: resolves the vector types and every
+// per-operation function name through typed_vector_type / typed_vector_int_type /
+// typed_func, then passes them, in the pump macro's parameter order, to the pump
+// selected by `pumps`. hasFMA, packedLoadFunc and packedStoreFunc are forwarded
+// unchanged.
+#define typed_converter_platform(platform, pumps, floatType, elementCount, hasFMA, packedLoadFunc, packedStoreFunc) \
+typed_converter_pumps(pumps)(floatType, \
+typed_vector_type(platform, floatType, elementCount), \
+typed_vector_int_type(platform, floatType, elementCount), \
+elementCount, \
+typed_func(platform, floatType, elementCount, set1), \
+typed_func(platform, floatType, elementCount, set), \
+typed_func(platform, floatType, elementCount, seti), \
+typed_func(platform, floatType, elementCount, fmadd), \
+typed_func(platform, floatType, elementCount, mul), \
+typed_func(platform, floatType, elementCount, add), \
+hasFMA, \
+typed_func(platform, floatType, elementCount, permute_lanes), \
+typed_func(platform, floatType, elementCount, i32gather), \
+typed_func(platform, floatType, elementCount, min), \
+typed_func(platform, floatType, elementCount, max), \
+typed_func(platform, floatType, elementCount, addi), \
+typed_func(platform, floatType, elementCount, ftoi), \
+typed_func(platform, floatType, elementCount, pow1), \
+packedLoadFunc, \
+packedStoreFunc \
+)
+
+
--- a/conv/conv_generic.h
+++ b/conv/conv_generic.h
@ -0,0 +1,58 @@
+
+// Name of the per-type lookup table: xyz12_to_linear_type(T) pastes T onto the
+// xyz12_to_linear_ prefix.
+#define xyz12_to_linear_name xyz12_to_linear_
+#define xyz12_to_linear_type(fType) concat(xyz12_to_linear_name, fType)
+
+// Lookup table with 1 << XYZ_LOOKUP_TABLE_SIZE entries of the current fType.
+fType xyz12_to_linear_type(fType)[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64)));
+
+// Limits v to the range [min, max]. Every parameter is parenthesized so that
+// expression arguments (e.g. "a + b") keep their meaning after expansion.
+// Arguments are still evaluated more than once, so they must be side-effect free.
+#define clamp(v, min, max) ((v) < (min) ? (min) : ((v) > (max) ? (max) : (v)))
+
+// Element (i, j) of a 3x3 matrix stored as nine consecutive values, three per row.
+#define M(m, i, j) ((m)[(i)*3+(j)])
+
+// Name prefixes of the generated functions; the float type is pasted on with concat().
+#define mxv_step_name mxv_step_
+#define _line_name convert_line_dci_xyz12_to_yuv16_
+
+// Computes one output component of a 3x3 matrix times the vector (a, b, c):
+// the dot product of row `step` of `matrix` (nine values, three per row) with
+// the vector.
+inline __attribute__((always_inline)) fType concat(mxv_step_name, fType) (const double* matrix, int step, fType a, fType b, fType c) {
+    const double* row = &matrix[step*3];
+    return row[0]*a + row[1]*b + row[2]*c;
+}
+
+// Scalar converter for one line of `width` pixels.
+// in holds three consecutive samples per pixel; luma, cb and cr each receive one
+// 16-bit value per pixel. height is accepted but not used.
+// Per pixel: table lookup, xyz2rgb_g matrix, clamp to [0, 1], power with
+// rgbGamma_g, rgb2yuv_g matrix, round, offset the chroma values by
+// int16MaxPlusOne and clamp to [0, uint16Max].
+inline __attribute__((always_inline)) void concat(_line_name, fType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
+    fType xyz[3] __attribute__((aligned(32)));
+    fType rgb[3] __attribute__((aligned(32)));
+    fType yuv[3] __attribute__((aligned(32)));
+
+    const fType rgbGamma_l = rgbGamma_g;
+
+    for (int px = 0; px < width; px++) {
+        const int base = px*3;
+
+        // table lookup for each of the three input samples
+        for (int ch = 0; ch < 3; ch++) {
+            xyz[ch] = xyz12_to_linear_type(fType)[in[base+ch] << (XYZ_LOOKUP_TABLE_SIZE - 12)];
+        }
+
+        // xyz -> rgb, one matrix row per output channel
+        for (int ch = 0; ch < 3; ch++) {
+            rgb[ch] = concat(mxv_step_name, fType)(xyz2rgb_g, ch, xyz[0], xyz[1], xyz[2]);
+        }
+
+        for (int ch = 0; ch < 3; ch++) {
+            rgb[ch] = clamp(rgb[ch], 0., 1.);
+        }
+
+        // todo: optimize this via https://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent ?
+
+        // pick the pow variant matching the width of fType
+        if (sizeof(fType) == 4){
+            for (int ch = 0; ch < 3; ch++) {
+                rgb[ch] = powf(rgb[ch], rgbGamma_l);
+            }
+        } else if (sizeof(fType) == 8){
+            for (int ch = 0; ch < 3; ch++) {
+                rgb[ch] = pow(rgb[ch], rgbGamma_l);
+            }
+        }
+
+        // rgb -> yuv, one matrix row per output channel
+        for (int ch = 0; ch < 3; ch++) {
+            yuv[ch] = concat(mxv_step_name, fType) (rgb2yuv_g, ch, rgb[0], rgb[1], rgb[2]);
+        }
+
+        luma[px] = clamp((int)(round(yuv[0])), 0, uint16Max);
+        cb[px] = clamp((int)(round(yuv[1])) + int16MaxPlusOne, 0, uint16Max);
+        cr[px] = clamp((int)(round(yuv[2])) + int16MaxPlusOne, 0, uint16Max);
+    }
+}
--- a/conv/simd_utils
+++ b/conv/simd_utils
@ -0,0 +1 @@
+Subproject commit e0aa01336b63d0c9d351a09dc24e0b22483219ad
--- a/convert.go
+++ b/convert.go
@ -0,0 +1,80 @@
+package main
+
+import (
+	"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/colorspace"
+	"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/conv"
+	"gonum.org/v1/gonum/mat"
+)
+
+// useCConverter routes conversion through the conv package instead of the Go
+// implementation; useFloat then selects the Float entry points over the Double ones.
+var useCConverter, useFloat bool
+
+// ConvertFrame converts a whole frame of height lines, each width pixels wide.
+// in carries three samples per pixel; y, cb and cr receive one value per pixel.
+// With useCConverter set the frame is handed to the conv package in one call;
+// otherwise it is converted line by line with ConvertLine.
+func ConvertFrame(in []uint32, y, cb, cr []uint16, width, height int) {
+	if !useCConverter {
+		samplesPerLine := width * 3
+		for line := 0; line < height; line++ {
+			ConvertLine(in, y, cb, cr, width, height)
+
+			// advance all four buffers to the start of the next line
+			in, y, cb, cr = in[samplesPerLine:], y[width:], cb[width:], cr[width:]
+		}
+		return
+	}
+
+	switch {
+	case useFloat:
+		conv.ConvertFrameDCIXYZToYUV16Float(in, y, cb, cr, width, height)
+	default:
+		conv.ConvertFrameDCIXYZToYUV16Double(in, y, cb, cr, width, height)
+	}
+}
+
+// ConvertLine converts one line of width pixels from in (three samples per pixel)
+// into the y, cb and cr planes.
+// With useCConverter set the line is handed to the conv package; otherwise the
+// Go path runs: LUT lookup, xyz2rgbDenorm matrix, clamp to [0, 1],
+// space.FromLinear, rgb2yuv matrix and scaling to 16 bit.
+func ConvertLine(in []uint32, y, cb, cr []uint16, width, height int) {
+	if useCConverter {
+		switch {
+		case useFloat:
+			conv.ConvertLineDCIXYZToYUV16Float(in, y, cb, cr, width, height)
+		default:
+			conv.ConvertLineDCIXYZToYUV16Double(in, y, cb, cr, width, height)
+		}
+		return
+	}
+
+	xyz := mat.NewVecDense(3, nil)
+	rgb := mat.NewVecDense(3, nil)
+	yuv := mat.NewVecDense(3, nil)
+
+	for px := 0; px < width; px++ {
+		base := px * 3
+
+		// LUT
+		for ch := 0; ch < 3; ch++ {
+			xyz.SetVec(ch, colorspace.DCIXYZSystem.ToLinearLUT[uint16(in[base+ch])<<4])
+		}
+
+		//TODO: apply white point correction here if necessary (but after denorm)
+
+		//denormalize + xyz2rgb
+		rgb.MulVec(xyz2rgbDenorm, xyz)
+
+		for ch := 0; ch < 3; ch++ {
+			//todo: some out of bounds r,g,b come up from here, maybe just fine
+			//clamp values into proper values. necessary due to XYZ ranges, todo: check conversion matrix for preserving this
+			clamped := min(1.0, max(0, rgb.AtVec(ch)))
+
+			// companding / adjustment with gamma curve
+			//TODO: why is it not using normal Rec709 and instead using straight gamma curve
+			rgb.SetVec(ch, space.FromLinear(clamped))
+		}
+
+		yuv.MulVec(rgb2yuv, rgb)
+
+		// map RGB to components
+		// scale float range to 16bit precision, in full swing
+		y[px] = LumaToFull16(yuv.AtVec(0))
+		cb[px] = ChromaToFull16(yuv.AtVec(1))
+		cr[px] = ChromaToFull16(yuv.AtVec(2))
+	}
+}
--- a/go.mod
+++ b/go.mod
@ -0,0 +1,5 @@
+module git.gammaspectra.live/WeebDataHoarder/xyz2yuv
+
+go 1.21
+
+require gonum.org/v1/gonum v0.14.0
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,4 @@
+golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
+golang.org/x/exp v0.0.0-20230321023759-10a507213a29/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
+gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
+gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
--- a/job.go
+++ b/job.go
@ -0,0 +1,27 @@
+package main
+
+import (
+	"sync"
+)
+
+// frameJobData bundles everything needed to convert one frame as a job.
+type frameJobData struct {
+	// wg is signalled with Done when Process returns.
+	wg *sync.WaitGroup
+	// frame is presumably the frame number this job belongs to; Process does not read it.
+	frame int
+	// height and width are passed to ConvertFrame as the frame dimensions.
+	height int
+	width  int
+	// in is the input sample buffer; y, cb and cr are the output planes.
+	in []uint32
+	y  []uint16
+	cb []uint16
+	cr []uint16
+}
+
+// Process converts the job's frame with ConvertFrame and marks the job as done on
+// its WaitGroup when it returns.
+func (job frameJobData) Process() {
+	defer job.wg.Done()
+
+	ConvertFrame(job.in, job.y, job.cb, job.cr, job.width, job.height)
+}
+
+// decodedFrame pairs a frame number with its sample data.
+// NOTE(review): presumably Frame holds the decoded XYZ samples that are later
+// handed to ConvertFrame - confirm at the use site.
+type decodedFrame struct {
+	Number int
+	Frame  []uint32
+}
--- a/libav/libav.go
+++ b/libav/libav.go
@ -0,0 +1,102 @@
+package libav
+
+/*
+#cgo pkg-config: libavformat libavcodec libavutil
+
+#include <libavformat/avformat.h>
+*/
+import "C"
+import (
+	"errors"
+	"io"
+	"slices"
+	"unsafe"
+)
+
+// PacketData is one video packet handed to the packet callback of OpenXYZ12.
+type PacketData struct {
+	// Number counts video packets in read order, starting at 0.
+	Number int
+	// Data is a Go-owned copy of the packet payload.
+	Data []byte
+}
+
+// OpenXYZ12 opens inputFile with libavformat, selects the best video stream and
+// feeds its packets to the callbacks.
+//
+// The stream must be JPEG 2000 with pixel format xyz12le. initFunc is called once
+// with the frame rate, sample aspect ratio and dimensions before any packet is
+// read. packetFunc is then called for every packet of the video stream with a
+// copy of its payload and a running packet number.
+//
+// Setup failures (open, stream lookup, wrong codec or format, allocation) are
+// returned as errors instead of panicking. An error from initFunc or packetFunc
+// is returned as is, except that io.EOF from packetFunc stops reading and yields
+// nil. Reading also stops, with a nil result, at the first negative return of
+// av_read_frame.
+func OpenXYZ12(inputFile string, initFunc func(framerateNum, framerateDen, sarNum, sarDen, width, height int) error, packetFunc func(packet PacketData) error) error {
+	var fmtCtx *C.AVFormatContext
+
+	//Open file and decoder
+
+	inputFileC := C.CString(inputFile)
+	defer C.free(unsafe.Pointer(inputFileC))
+
+	ret := C.avformat_open_input(&fmtCtx, inputFileC, nil, nil)
+	if ret < 0 {
+		return errors.New("cannot open input file")
+	}
+	defer C.avformat_close_input(&fmtCtx)
+
+	ret = C.avformat_find_stream_info(fmtCtx, nil)
+	if ret < 0 {
+		return errors.New("cannot find stream information")
+	}
+
+	//get video
+	ret = C.av_find_best_stream(fmtCtx, C.AVMEDIA_TYPE_VIDEO, -1, -1, nil, 0)
+	if ret < 0 {
+		return errors.New("cannot find video stream")
+	}
+
+	videoStreamIndex := ret
+
+	inputStream := unsafe.Slice(fmtCtx.streams, fmtCtx.nb_streams)[videoStreamIndex]
+
+	codecPar := inputStream.codecpar
+
+	if codecPar.codec_id != C.AV_CODEC_ID_JPEG2000 {
+		return errors.New("video codec not JPEG2000")
+	}
+
+	if codecPar.format != C.AV_PIX_FMT_XYZ12LE {
+		return errors.New("video format not xyz12le")
+	}
+
+	codecPar.color_range = C.AVCOL_RANGE_JPEG
+	codecPar.color_primaries = C.AVCOL_PRI_SMPTE431
+	//codecPar.color_trc = C.AVCOL_PRI_SMPTE431
+	//codecPar.color_space = C.AVCOL_PRI_SMPTE431
+
+	var packet *C.AVPacket
+	packet = C.av_packet_alloc()
+	if packet == nil {
+		return errors.New("err allocating")
+	}
+	// av_packet_free also releases a packet that is still referenced, which covers
+	// the early exits from the read loop below.
+	defer C.av_packet_free(&packet)
+
+	err := initFunc(int(inputStream.codec.framerate.num), int(inputStream.codec.framerate.den), int(inputStream.sample_aspect_ratio.num), int(inputStream.sample_aspect_ratio.den), int(codecPar.width), int(codecPar.height))
+	if err != nil {
+		return err
+	}
+
+	var frameNumber int
+
+	for {
+		// NOTE(review): every negative return ends the loop silently, so read
+		// errors are not told apart from end of file.
+		ret = C.av_read_frame(fmtCtx, packet)
+		if ret < 0 {
+			break
+		}
+		if packet.stream_index == videoStreamIndex {
+			// The payload is cloned because the packet buffer is released by
+			// av_packet_unref at the end of the iteration.
+			err = packetFunc(PacketData{
+				Number: frameNumber,
+				Data:   slices.Clone(unsafe.Slice((*byte)(packet.data), int(packet.size))),
+			})
+			if err != nil {
+				if errors.Is(err, io.EOF) {
+					break
+				}
+				return err
+			}
+
+			frameNumber++
+		}
+		C.av_packet_unref(packet)
+	}
+
+	return nil
+}
--- a/libopenjp2/libopenjp2.go
+++ b/libopenjp2/libopenjp2.go
@ -0,0 +1,141 @@
+package libopenjp2
+
+/*
+#cgo pkg-config: libopenjp2
+
+#include "libopenjp2.h"
+*/
+import "C"
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"runtime"
+	"slices"
+	"unsafe"
+)
+
+// FFmpeg removed its libopenjpeg decoder: https://github.com/FFmpeg/FFmpeg/commit/60ccb3fe787be3bb10fc4545b3593cd1e0b769ed
+
+const Jp2SigType = 0x6A502020 // ASCII "jP  "; DecodeFrame compares it against the 32-bit word at bytes 4-7 of the input
+const Jp2SigValue = 0x0D0A870A // bytes "\r\n\x87\n"; DecodeFrame compares it against the 32-bit word at bytes 8-11 of the input
+
+type Jpeg2000Decoder struct { // carries the OpenJPEG decoding parameters that DecodeFrame applies to each codec it creates
+	ctx C.opj_dparameters_t // filled with OpenJPEG defaults, then cp_layer/cp_reduce are set by NewJpeg2000Decoder
+}
+
+const QualityLayersAll = 0 // value for NewJpeg2000Decoder's qualityLayers (stored in cp_layer); presumably 0 means "decode every quality layer" - confirm against openjpeg.h
+const ResolutionLayersAll = 0 // value for NewJpeg2000Decoder's resolutionLayers (stored in cp_reduce); presumably 0 means "full resolution" - confirm against openjpeg.h
+
+func NewJpeg2000Decoder(qualityLayers, resolutionLayers uint) (*Jpeg2000Decoder, error) {
+	d := &Jpeg2000Decoder{}
+	C.opj_set_default_decoder_parameters(&d.ctx)
+
+	d.ctx.cp_layer = C.uint32_t(qualityLayers)
+	d.ctx.cp_reduce = C.uint32_t(resolutionLayers)
+
+	return d, nil
+}
+
+var jp2c = binary.LittleEndian.Uint32([]byte("jp2c")) // "jp2c" tag as a little-endian word; DecodeFrame compares it with a little-endian read of buf[4:]
+
+type Jpeg2000Frame struct { // one decoded image as returned by DecodeFrame
+	Width, Height int // dimensions of component 0
+	X, Y, Z       []uint32 // Go-owned copies of components 0, 1 and 2; each holds w*h samples of its component
+}
+
+// DecodeFrame decodes a single JPEG 2000 image held in buf and returns its
+// three components as separate planes.
+//
+// buf may be a JP2 file (recognized by its 12-byte signature box), a raw J2K
+// codestream, or a codestream preceded by an 8-byte "jp2c" box header, which
+// is skipped. Only images with exactly three components of 12-bit precision
+// are accepted. The returned slices are Go-owned copies; every OpenJPEG
+// object created here is destroyed before returning.
+//
+// An error is returned when the input is too short, when OpenJPEG fails to
+// set up, parse or decode the image, or when the image layout is unexpected.
+func (d *Jpeg2000Decoder) DecodeFrame(buf []byte) (frame *Jpeg2000Frame, err error) {
+	// The format probes below read the first 12 bytes; reject anything shorter
+	// instead of panicking on an out-of-range slice access (or on pinning the
+	// nil data pointer of an empty slice).
+	if len(buf) < 12 {
+		return nil, errors.New("input too short for a JPEG 2000 image")
+	}
+
+	var image *C.opj_image_t
+	var dec *C.opj_codec_t
+	var stream *C.opj_stream_t
+
+	// reader (below) stores a pointer into buf and is itself handed to C, so
+	// the backing array is pinned for the duration of the call.
+	var pinner runtime.Pinner
+	pinner.Pin(unsafe.Pointer(unsafe.SliceData(buf)))
+	defer pinner.Unpin()
+
+	// Check if input is a raw jpeg2k codestream or in jp2 wrapping.
+	// The signature constants spell "jP  " / "\r\n\x87\n" in byte order, so
+	// the box fields have to be read big-endian for the comparison to match.
+	if (binary.BigEndian.Uint32(buf) == 12) &&
+		(binary.BigEndian.Uint32(buf[4:]) == Jp2SigType) &&
+		(binary.BigEndian.Uint32(buf[8:]) == Jp2SigValue) {
+		dec = C.opj_create_decompress(C.OPJ_CODEC_JP2)
+	} else {
+		/* If the AVPacket contains a jp2c box, then skip to
+		 * the starting byte of the codestream. */
+		// jp2c is built with a little-endian read, so the same byte order is
+		// used on the input side.
+		if binary.LittleEndian.Uint32(buf[4:]) == jp2c {
+			buf = buf[8:]
+		}
+		dec = C.opj_create_decompress(C.OPJ_CODEC_J2K)
+	}
+
+	if dec == nil {
+		return nil, errors.New("error initializing decoder")
+	}
+	defer C.opj_destroy_codec(dec)
+
+	// Tie decoder with decoding parameters
+	if C.opj_setup_decoder(dec, &d.ctx) != 1 {
+		return nil, errors.New("error setting up decoder")
+	}
+
+	stream = C.opj_stream_default_create(C.OPJ_STREAM_READ)
+	if stream == nil {
+		return nil, errors.New("error initializing stream")
+	}
+	defer C.opj_stream_destroy(stream)
+
+	// In-memory source consumed by the C stream callbacks.
+	reader := &C.BufferReader{
+		pos:    0,
+		size:   C.int(len(buf)),
+		buffer: (*C.uchar)(unsafe.Pointer(unsafe.SliceData(buf))),
+	}
+
+	// reader is Go memory referenced from the C stream, so it is pinned too.
+	pinner.Pin(reader)
+
+	C.set_stream_callbacks(stream)
+	C.opj_stream_set_user_data(stream, unsafe.Pointer(reader), nil)
+	C.opj_stream_set_user_data_length(stream, C.ulong(len(buf)))
+
+	ret := C.opj_read_header(stream, dec, &image)
+	// Registered before the result check so that an image allocated by a
+	// failed header read is still released.
+	defer C.opj_image_destroy(image)
+
+	if ret != 1 {
+		return nil, errors.New("error decoding stream header")
+	}
+
+	if image.numcomps != 3 {
+		return nil, fmt.Errorf("unexpected component number %d", image.numcomps)
+	}
+
+	components := unsafe.Slice(image.comps, int(image.numcomps))
+
+	for i, c := range components {
+		if c.prec != 12 {
+			return nil, fmt.Errorf("unexpected component %d bit depth %d", i, c.prec)
+		}
+	}
+
+	ret = C.opj_decode(dec, stream, image)
+
+	if ret != 1 {
+		return nil, errors.New("error decoding image")
+	}
+
+	for i, c := range components {
+		if c.data == nil {
+			return nil, fmt.Errorf("component %d has no data", i)
+		}
+	}
+
+	// readComponent views the C-owned sample buffer of one component as a
+	// []uint32 of h*w entries; the view is only valid until image is destroyed.
+	readComponent := func(index int) []uint32 {
+		return unsafe.Slice((*uint32)(unsafe.Pointer(components[index].data)), int(components[index].h)*int(components[index].w))
+	}
+
+	// Clone each plane so the result does not reference C memory.
+	frame = &Jpeg2000Frame{
+		Width:  int(components[0].w),
+		Height: int(components[0].h),
+		X:      slices.Clone(readComponent(0)),
+		Y:      slices.Clone(readComponent(1)),
+		Z:      slices.Clone(readComponent(2)),
+	}
+
+	return frame, nil
+}
--- a/libopenjp2/libopenjp2.h
+++ b/libopenjp2/libopenjp2.h
@ -0,0 +1,67 @@
+#include <string.h>
+
+#include <openjpeg.h>
+
+typedef struct BufferReader { /* cursor over an in-memory buffer, used as user data by the stream callbacks */
+    int pos; /* current offset into buffer, in bytes */
+    int size; /* total number of bytes available in buffer */
+    const uint8_t *buffer; /* start of the data being read */
+} BufferReader;
+
+/*
+ * Read callback: copies up to nb_bytes from the reader's current position
+ * into out_buffer and advances the position by the amount copied.
+ * Returns the number of bytes copied, or (OPJ_SIZE_T)-1 once the position
+ * has reached the end of the buffer.
+ */
+static OPJ_SIZE_T stream_read(void *out_buffer, OPJ_SIZE_T nb_bytes, void *user_data) {
+    BufferReader *r = user_data;
+    int left = r->size - r->pos;
+
+    if (left == 0) {
+        return (OPJ_SIZE_T)-1;
+    }
+    /* Never hand out more than what is left in the buffer. */
+    if (nb_bytes > left) {
+        nb_bytes = left;
+    }
+    memcpy(out_buffer, r->buffer + r->pos, nb_bytes);
+    r->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+/*
+ * Skip callback: moves the reader's position by nb_bytes, which may be
+ * negative to move backwards. The move is clamped to the buffer bounds.
+ * Returns the number of bytes actually skipped, or -1 when the position is
+ * already at the start (backward skip) or at the end (forward skip).
+ */
+static OPJ_OFF_T stream_skip(OPJ_OFF_T nb_bytes, void *user_data)
+{
+    BufferReader *reader = user_data;
+    if (nb_bytes < 0) {
+        if (reader->pos == 0) {
+            /* Cast to the signed return type: going through OPJ_SIZE_T
+             * yields a large positive value where size_t is narrower
+             * than OPJ_OFF_T. */
+            return (OPJ_OFF_T)-1;
+        }
+        /* Do not move before the start of the buffer. */
+        if (nb_bytes + reader->pos < 0) {
+            nb_bytes = -reader->pos;
+        }
+    } else {
+        int remaining;
+
+        if (reader->pos == reader->size) {
+            return (OPJ_OFF_T)-1;
+        }
+        /* Do not move past the end of the buffer. */
+        remaining = reader->size - reader->pos;
+        if (nb_bytes > remaining) {
+            nb_bytes = remaining;
+        }
+    }
+    reader->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+/*
+ * Seek callback: sets the reader's position to the absolute offset nb_bytes.
+ * Returns OPJ_TRUE on success and OPJ_FALSE, leaving the position untouched,
+ * when the offset is negative or larger than the buffer size.
+ */
+static OPJ_BOOL stream_seek(OPJ_OFF_T nb_bytes, void *user_data)
+{
+    BufferReader *r = user_data;
+    if (nb_bytes >= 0 && nb_bytes <= r->size) {
+        r->pos = (int)nb_bytes;
+        return OPJ_TRUE;
+    }
+    return OPJ_FALSE;
+}
+
+
+/* Registers the read, skip and seek callbacks defined above on the given stream. */
+static void set_stream_callbacks(opj_stream_t* s) {
+    opj_stream_set_seek_function(s, stream_seek);
+    opj_stream_set_skip_function(s, stream_skip);
+    opj_stream_set_read_function(s, stream_read);
+}
--- a/parameters.go
+++ b/parameters.go
@ -0,0 +1,29 @@
+package main
+
+import (
+	"gonum.org/v1/gonum/mat"
+	"math"
+)
+
+// LumaToFull16 scales l by math.MaxUint16, rounds to the nearest integer and
+// clamps the result to the uint16 range.
+func LumaToFull16(l float64) uint16 {
+	scaled := int(math.Round(l * math.MaxUint16))
+	switch {
+	case scaled < 0:
+		return 0
+	case scaled > math.MaxUint16:
+		return math.MaxUint16
+	}
+	return uint16(scaled)
+}
+
+// ChromaToFull16 scales c by math.MaxUint16, shifts it by math.MaxInt16+1 so
+// that zero lands mid-range, rounds to the nearest integer and clamps the
+// result to the uint16 range.
+func ChromaToFull16(c float64) uint16 {
+	shifted := int(math.Round(c*math.MaxUint16 + math.MaxInt16 + 1))
+	if shifted < 0 {
+		return 0
+	}
+	if shifted > math.MaxUint16 {
+		return math.MaxUint16
+	}
+	return uint16(shifted)
+}
+
+// RoundMatToPrecision returns a new matrix in which every entry of m is
+// rounded to the given number of decimal places. m itself is not modified.
+// When decimals is zero or negative no rounding is applied and an unrounded
+// copy of m is returned.
+func RoundMatToPrecision(m *mat.Dense, decimals int) *mat.Dense {
+	var o mat.Dense
+	if decimals <= 0 {
+		o.CloneFrom(m)
+		return &o
+	}
+	// 10^decimals: scaling by it moves the digits to keep in front of the
+	// decimal point, so Round drops exactly the remaining ones.
+	factor := math.Pow(10, float64(decimals))
+
+	o.Apply(func(i, j int, v float64) float64 {
+		return math.Round(v*factor) / factor
+	}, m)
+	return &o
+}
--- a/scripts/README.md
+++ b/scripts/README.md
@ -0,0 +1,5 @@
+# Script collection for playback
+
+```shell
+$ [script].sh "input.mkv" 
+```
--- a/scripts/playback-mpv.sh
+++ b/scripts/playback-mpv.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Playbacks an XYZ input file straight via mpv
+
+cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)"
+
+../bin/xyz2yuv \
+-in "${1}" \
+-colorspace "${2}" \
+-out - | mpv \
+--demuxer-max-bytes=4096MiB --cache=yes --cache-secs=30 \
+--force-seekable=yes - \
+--external-file="${1}" \
+--vid=1 --aid=1
--- a/scripts/playback-rec709-mpv.sh
+++ b/scripts/playback-rec709-mpv.sh
@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Plays back an XYZ input file straight via mpv, using the rec709_pure colorspace.
+
+# Resolve the input first: the cd below would otherwise change what a
+# relative path refers to.
+input="$(realpath -- "${1}")" || exit 1
+
+cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1
+
+./playback-mpv.sh "${input}" rec709_pure
--- a/scripts/playback-rec709_22-mpv.sh
+++ b/scripts/playback-rec709_22-mpv.sh
@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Plays back an XYZ input file straight via mpv, using the rec709_pure22 colorspace.
+
+# Resolve the input first: the cd below would otherwise change what a
+# relative path refers to.
+input="$(realpath -- "${1}")" || exit 1
+
+cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1
+
+./playback-mpv.sh "${input}" rec709_pure22
--- a/scripts/playback-rec709_24-mpv.sh
+++ b/scripts/playback-rec709_24-mpv.sh
@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Plays back an XYZ input file straight via mpv, using the rec709_pure24 colorspace.
+
+# Resolve the input first: the cd below would otherwise change what a
+# relative path refers to.
+input="$(realpath -- "${1}")" || exit 1
+
+cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1
+
+./playback-mpv.sh "${input}" rec709_pure24
--- a/xyz2yuv.go
+++ b/xyz2yuv.go
@ -0,0 +1,427 @@
+package main
+
+import "C"
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"flag"
+	"fmt"
+	"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/colorspace"
+	"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/conv"
+	"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/libav"
+	"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/libopenjp2"
+	"gonum.org/v1/gonum/mat"
+	"io"
+	"math"
+	"os"
+	"runtime"
+	"slices"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+	"unsafe"
+)
+
+var space colorspace.RelativeSystem // target colorspace, selected in main from the -colorspace flag
+
+var xyz2rgb *mat.Dense // XYZ -> RGB matrix from space.Chromaticity.ConversionXYZ, rounded per -precision-xyz2rgb
+var xyz2rgbDenorm *mat.Dense // diag(1/colorspace.DCINormalizationFactor) * unrounded xyz2rgb, then rounded the same way
+
+var rgb2yuv *mat.Dense // RGB -> YUV matrix from space.YCbCr.ConversionRGB, rounded per -precision-rgb2yuv
+var rgb2yuvPremultiplied *mat.Dense // unrounded rgb2yuv * diag(math.MaxUint16), then rounded the same way
+
+var rgbGamma float64 // gamma constant chosen together with space; passed to conv.InitData
+
+// ToPacked interleaves three equally long planes into one slice laid out as
+// a[0], b[0], c[0], a[1], b[1], c[1], ... The result has length 3*len(a) and
+// extra additional elements of spare capacity. It panics when the planes
+// differ in length.
+func ToPacked(a, b, c []uint32, extra int) []uint32 {
+	n := len(a)
+	if len(b) != n || len(c) != n {
+		panic("lengths mismatch")
+	}
+
+	packed := make([]uint32, n*3, n*3+extra)
+	for i, j := 0, 0; i < n; i, j = i+1, j+3 {
+		packed[j], packed[j+1], packed[j+2] = a[i], b[i], c[i]
+	}
+	return packed
+}
+
+// main parses the command line, builds the XYZ -> RGB and RGB -> YUV matrices
+// for the selected colorspace and then streams the input through a concurrent
+// pipeline:
+//
+//	libav packets -> JPEG 2000 decoders -> reorder -> conversion workers -> reorder -> YUV4MPEG2 output
+//
+// Frames are written in frame-number order as 4:4:4 16-bit planes. Any error
+// along the way aborts the program with a panic.
+func main() {
+	inFile := flag.String("in", "", "Input file")
+	startFrame := flag.Uint64("start", 0, "Start frame number inclusive")
+	endFrame := flag.Uint64("end", math.MaxUint64, "End frame number exclusive")
+	outFile := flag.String("out", "-", "Output file. Use - for stdout")
+	colorspaceRelativeSystem := flag.String("colorspace", "rec709_pure", "Colorspace and parameters to convert into. Supported: rec709, rec709_pure, rec709_pure22, rec709_pure24, rec2020, rec2020_pure, rec2020_pure24")
+	xyzPrecision := flag.Int("precision-xyz2rgb", 0, "XYZ -> RGB conversion matrix precision. 0 = maximum")
+	rgbPrecision := flag.Int("precision-rgb2yuv", 0, "RGB -> YUV conversion matrix precision. 0 = maximum")
+	lowres := flag.Uint("lowres", 0, "Feed lowres parameter. Default is full frame")
+	useFloatPipeline := flag.Bool("float", false, "Use float pipeline instead of double, although less precise. Very fast.")
+	useGoPipeline := flag.Bool("use-go-pipeline", false, "Use Go pipeline, although slower. Does not support float mode.")
+	hashOutput := flag.Bool("hash", false, "Hash with SHA256 each output frame for accuracy comparisons")
+	decoderThreads := flag.Uint("decoder-threads", 0, "Threads for JPEG2000 decoding. Defaults to number of logical CPU")
+	pipelineThreads := flag.Uint("pipeline-threads", 0, "Threads for colorspace conversion pipeline. Defaults to number of logical CPU")
+	flag.Parse()
+
+	//C.av_log_set_level(C.AV_LOG_DEBUG)
+
+	numDecoderCpu := int(*decoderThreads)
+	if numDecoderCpu == 0 {
+		numDecoderCpu = runtime.NumCPU()
+	}
+
+	numPipelineCpu := int(*pipelineThreads)
+	if numPipelineCpu == 0 {
+		numPipelineCpu = runtime.NumCPU()
+	}
+
+	useCConverter = !*useGoPipeline
+	useFloat = *useFloatPipeline
+
+	switch strings.ToLower(*colorspaceRelativeSystem) {
+	case "rec709":
+		space = colorspace.SystemRec709
+		rgbGamma = colorspace.GammaRec709
+	case "rec709_pure":
+		space = colorspace.SystemRec709_Pure
+		rgbGamma = colorspace.GammaRec709
+	case "rec709_pure22":
+		space = colorspace.SystemRec709_Pure22
+		rgbGamma = colorspace.Gamma22
+	case "rec709_pure24":
+		space = colorspace.SystemRec709_Pure24
+		rgbGamma = colorspace.Gamma24
+	case "rec2020":
+		space = colorspace.SystemRec2020
+		rgbGamma = colorspace.GammaRec2020
+	case "rec2020_pure":
+		space = colorspace.SystemRec2020_Pure
+		rgbGamma = colorspace.GammaRec2020
+	case "rec2020_pure24":
+		space = colorspace.SystemRec2020_Pure24
+		rgbGamma = colorspace.Gamma24
+
+	default:
+		panic("unsupported colorspace")
+	}
+
+	_, xyz2rgb = space.Chromaticity.ConversionXYZ()
+	_, rgb2yuv = space.YCbCr.ConversionRGB()
+
+	//adjust xyz2rgb with normalization factor from DCI
+	denorm := mat.NewDiagDense(3, []float64{
+		1 / colorspace.DCINormalizationFactor,
+		1 / colorspace.DCINormalizationFactor,
+		1 / colorspace.DCINormalizationFactor,
+	})
+
+	xyz2rgbDenorm = mat.NewDense(3, 3, nil)
+	xyz2rgbDenorm.Mul(denorm, xyz2rgb)
+
+	// Fold the scaling to the 16-bit output range into the RGB -> YUV matrix.
+	premult := mat.NewDiagDense(3, []float64{
+		math.MaxUint16,
+		math.MaxUint16,
+		math.MaxUint16,
+	})
+
+	rgb2yuvPremultiplied = mat.NewDense(3, 3, nil)
+	rgb2yuvPremultiplied.Mul(rgb2yuv, premult)
+
+	// Rounding happens last so the derived matrices are built from full precision.
+	xyz2rgb = RoundMatToPrecision(xyz2rgb, *xyzPrecision)
+	rgb2yuv = RoundMatToPrecision(rgb2yuv, *rgbPrecision)
+	xyz2rgbDenorm = RoundMatToPrecision(xyz2rgbDenorm, *xyzPrecision)
+	rgb2yuvPremultiplied = RoundMatToPrecision(rgb2yuvPremultiplied, *rgbPrecision)
+
+	_, _ = fmt.Fprintf(os.Stderr, "\nXYZ to RGB matrix:\n%v\n\n", mat.Formatted(xyz2rgb))
+	_, _ = fmt.Fprintf(os.Stderr, "\nXYZ to RGB matrix (denormalized):\n%v\n\n", mat.Formatted(xyz2rgbDenorm))
+	_, _ = fmt.Fprintf(os.Stderr, "\nRGB to YUV matrix:\n%v\n\n", mat.Formatted(rgb2yuv))
+	_, _ = fmt.Fprintf(os.Stderr, "\nRGB to YUV matrix (premultiplied):\n%v\n\n", mat.Formatted(rgb2yuvPremultiplied))
+
+	if useCConverter {
+		_, _ = fmt.Fprintf(os.Stderr, "\nDecoder: CGO %s\n", conv.DecoderInformation())
+	} else {
+		_, _ = fmt.Fprintf(os.Stderr, "\nDecoder: Go Generic scalar pipeline (1d 1f)\n")
+	}
+
+	if useFloat && useCConverter {
+		_, _ = fmt.Fprintf(os.Stderr, "Data type: float32\n\n")
+	} else {
+		_, _ = fmt.Fprintf(os.Stderr, "Data type: float64\n\n")
+	}
+
+	if useCConverter {
+		conv.InitData(xyz2rgbDenorm, rgb2yuvPremultiplied, colorspace.GammaDCIXYZ, rgbGamma)
+	}
+
+	//open and write output file header
+	var output *os.File
+	if *outFile == "-" {
+		output = os.Stdout
+	} else {
+		f, err := os.Create(*outFile)
+		if err != nil {
+			panic(err)
+		}
+		output = f
+	}
+	defer output.Close()
+
+	// outputFrame writes one "FRAME" record followed by the raw bytes of the
+	// y, cb and cr planes, optionally printing a SHA256 of the planes to stderr.
+	outputFrame := func(number int, y, cb, cr []uint16) error {
+		// Reinterpret the uint16 planes as bytes without copying.
+		by := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(y))), len(y)*2)
+		bcb := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(cb))), len(cb)*2)
+		bcr := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(cr))), len(cr)*2)
+		if *hashOutput {
+			hasher := sha256.New()
+			hasher.Write(by)
+			hasher.Write(bcb)
+			hasher.Write(bcr)
+			fmt.Fprintf(os.Stderr, "\r%s\n", hex.EncodeToString(hasher.Sum(nil)))
+			//fmt.Fprintf(os.Stderr, "\rFrame %d: %s\n", number, hex.EncodeToString(hasher.Sum(nil)))
+		}
+
+		_, err := output.WriteString("FRAME\n")
+		if err != nil {
+			return err
+		}
+		_, err = output.Write(by)
+		if err != nil {
+			return err
+		}
+		_, err = output.Write(bcb)
+		if err != nil {
+			return err
+		}
+		_, err = output.Write(bcr)
+		if err != nil {
+			return err
+		}
+		return nil
+	}
+
+	// decode and processing loop
+	var wg sync.WaitGroup
+
+	// availableFrames is a pool of reusable output buffers; inFrameJobs and
+	// outFrameJobs carry jobs into and out of the conversion workers.
+	availableFrames := make(chan *frameJobData, numPipelineCpu*2)
+	inFrameJobs := make(chan *frameJobData, numPipelineCpu)
+	outFrameJobs := make(chan *frameJobData, numPipelineCpu)
+
+	// availableDecoders holds one token per packet that may be in flight
+	// between the demuxer and the decoder reorder stage.
+	availableDecoders := make(chan struct{}, numDecoderCpu*2)
+	outDecoderJobs := make(chan *decodedFrame, numPipelineCpu)
+	jpegDecoderChannel := make(chan libav.PacketData, numDecoderCpu)
+
+	// Next frame number each reorder stage has to emit.
+	var expectedFrame = *startFrame
+	var expectedFrameDecoder = expectedFrame
+	var processedFrames atomic.Uint64
+
+	var firstFrameTime time.Time
+
+	// Conversion workers.
+	var wg2 sync.WaitGroup
+	for i := 0; i < numPipelineCpu; i++ {
+		wg2.Add(1)
+		go func() {
+			defer wg2.Done()
+			for job := range inFrameJobs {
+				job.Process()
+				processedFrames.Add(1)
+				outFrameJobs <- job
+			}
+		}()
+	}
+
+	// Close the worker output once every conversion worker has exited.
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		wg2.Wait()
+		close(outFrameJobs)
+	}()
+
+	// Writer: restores frame order and writes frames to the output.
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+
+		outputs := make([]*frameJobData, 0)
+		for out := range outFrameJobs {
+			outputs = append(outputs, out)
+			slices.SortFunc(outputs, func(a, b *frameJobData) int {
+				return a.frame - b.frame
+			})
+
+			// Flush every buffered frame that is next in sequence.
+			for len(outputs) > 0 {
+				f := outputs[0]
+
+				if f.frame != int(expectedFrame) {
+					break
+				}
+
+				//output frame to file
+				err := outputFrame(f.frame, f.y, f.cb, f.cr)
+				if err != nil {
+					panic(err)
+				}
+
+				outputs = slices.Delete(outputs, 0, 1)
+
+				expectedFrame++
+
+				// Hand the buffers back to the pool for reuse.
+				availableFrames <- f
+			}
+		}
+	}()
+
+	decoder, err := libopenjp2.NewJpeg2000Decoder(libopenjp2.QualityLayersAll, *lowres)
+	if err != nil {
+		panic(err)
+	}
+
+	var streamFramerateNum, streamFramerateDen, streamSarNum, streamSarDen int
+
+	// JPEG 2000 decoders.
+	var onceInit sync.Once
+	var wgDecoder sync.WaitGroup
+	for i := 0; i < numDecoderCpu*2; i++ {
+		wg.Add(1)
+		wgDecoder.Add(1)
+		go func() {
+			defer wg.Done()
+			// Must be Done, not Add: wgDecoder.Wait() below would otherwise
+			// block forever once the decoders have exited.
+			defer wgDecoder.Done()
+
+			for p := range jpegDecoderChannel {
+
+				frame, err := decoder.DecodeFrame(p.Data)
+				if err != nil {
+					panic(err)
+				}
+
+				// The first decoded frame provides the dimensions needed for
+				// the stream header and for sizing the output buffer pool.
+				onceInit.Do(func() {
+					_, err := fmt.Fprintf(output, "YUV4MPEG2 W%d H%d F%d:%d I%s A%d:%d%s%s\n",
+						frame.Width,
+						frame.Height,
+						streamFramerateNum,
+						streamFramerateDen,
+						"p",
+						streamSarNum,
+						streamSarDen,
+						" C444p16 XYSCSS=444P16",
+						" XCOLORRANGE=FULL",
+					)
+					if err != nil {
+						panic(err)
+					}
+
+					yuvLineSize := frame.Width
+					yuvFrameSize := frame.Height * yuvLineSize
+
+					for i := 0; i < numPipelineCpu*2; i++ {
+						availableFrames <- &frameJobData{
+							wg:     &wg,
+							frame:  0,
+							width:  frame.Width,
+							height: frame.Height,
+							in:     nil,
+							//add extra capacity for OOB writes in ASM code
+							y:  make([]uint16, yuvFrameSize, yuvFrameSize+64),
+							cb: make([]uint16, yuvFrameSize, yuvFrameSize+64),
+							cr: make([]uint16, yuvFrameSize, yuvFrameSize+64),
+						}
+					}
+					firstFrameTime = time.Now().UTC()
+				})
+
+				outDecoderJobs <- &decodedFrame{
+					Number: p.Number,
+					//add extra capacity for OOB reads in ASM code
+					Frame: ToPacked(frame.X, frame.Y, frame.Z, 256),
+				}
+			}
+		}()
+	}
+
+	// Decoder reorder stage: restores frame order and feeds the workers.
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		defer close(inFrameJobs)
+
+		outputs := make([]*decodedFrame, 0)
+		for out := range outDecoderJobs {
+			outputs = append(outputs, out)
+			slices.SortFunc(outputs, func(a, b *decodedFrame) int {
+				return a.Number - b.Number
+			})
+
+			for len(outputs) > 0 {
+				frame := outputs[0]
+
+				if frame.Number != int(expectedFrameDecoder) {
+					break
+				}
+
+				// Blocks until an output buffer is free.
+				f := <-availableFrames
+				f.frame = frame.Number
+				//f.inLineSize = linesize
+
+				f.in = frame.Frame
+				// NOTE(review): presumably balanced by a Done inside
+				// frameJobData.Process via the wg field - confirm.
+				wg.Add(1)
+				inFrameJobs <- f
+
+				outputs = slices.Delete(outputs, 0, 1)
+
+				expectedFrameDecoder++
+
+				// Allow the demuxer to queue another packet.
+				availableDecoders <- struct{}{}
+			}
+		}
+	}()
+
+	// Progress reporter; runs until the process exits.
+	go func() {
+		for range time.Tick(time.Second) {
+			frames := processedFrames.Load()
+			runningTime := time.Now().UTC().Sub(firstFrameTime)
+			// processedFrames counts converted frames, so it is used as is;
+			// it is not a frame number relative to the start frame.
+			fps := float64(frames) / runningTime.Seconds()
+			_, _ = fmt.Fprintf(os.Stderr, "\rFrames %d %.02f fps %s     ", frames, fps, runningTime.Truncate(time.Second))
+		}
+	}()
+
+	err = libav.OpenXYZ12(*inFile, func(framerateNum, framerateDen, sarNum, sarDen, width, height int) error {
+		streamFramerateNum = framerateNum
+		streamFramerateDen = framerateDen
+		streamSarNum = sarNum
+		streamSarDen = sarDen
+
+		// Seed the in-flight packet tokens.
+		for i := 0; i < numDecoderCpu*2; i++ {
+			availableDecoders <- struct{}{}
+		}
+
+		return nil
+	}, func(p libav.PacketData) error {
+		// Skip packets before the requested start frame.
+		if uint64(p.Number) < *startFrame {
+			firstFrameTime = time.Now().UTC()
+			return nil
+		}
+
+		// io.EOF makes OpenXYZ12 stop reading without reporting an error.
+		if uint64(p.Number) >= *endFrame {
+			return io.EOF
+		}
+
+		<-availableDecoders
+
+		jpegDecoderChannel <- p
+		return nil
+	})
+	if err != nil {
+		panic(err)
+	}
+	// Shut the pipeline down stage by stage.
+	close(jpegDecoderChannel)
+	wgDecoder.Wait()
+	close(outDecoderJobs)
+
+	wg.Wait()
+
+	print("\n\n")
+
+	runningTime := time.Now().UTC().Sub(firstFrameTime)
+	fps := float64(processedFrames.Load()) / runningTime.Seconds()
+	_, _ = fmt.Fprintf(os.Stderr, "\nTotal %d frames, %.02f fps, took %s     \n", processedFrames.Load(), fps, runningTime.Truncate(time.Millisecond))
+
+}
				`@ -0,0 +1 @@`
				`Subproject commit e0aa01336b63d0c9d351a09dc24e0b22483219ad`