Initial commit

This commit is contained in:
DataHoarder 2024-02-19 20:00:08 +01:00
commit 6eda53859e
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
35 changed files with 2442 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/bin/*

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "conv/simd_utils"]
path = conv/simd_utils
url = https://github.com/JishinMaster/simd_utils.git

19
LICENSE Normal file
View file

@ -0,0 +1,19 @@
Copyright (c) 2024 WeebDataHoarder, xyz2yuv Contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

11
README.md Normal file
View file

@ -0,0 +1,11 @@
# xyz2yuv tool
Decode DCI XYZ' and apply conversions to your desired output colorspace. Supports Rec. 709 and Rec. 2020, with adjustable precision and gamma values.
Supports AVX-512 and AVX2 targets, and a generic implementation in C and Go as well.
## Dependencies
* CGO
* libopenjp2-dev
* libavformat-dev
* libavcodec-dev
* libavutil-dev

0
bin/.gitkeep Normal file
View file

14
build.sh Executable file
View file

@ -0,0 +1,14 @@
#!/bin/bash
# Builds the xyz2yuv binaries into bin/: one default build plus one build per
# SIMD_PUMPS setting (2, 4, 8), each selected through CGO_CFLAGS.

# Run from the directory containing this script so that "./" and "bin/"
# resolve correctly. Abort if the directory cannot be entered; otherwise the
# builds below would run against whatever the caller's working directory is.
cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

# Prefer the toolchain under GOROOT when it is set, otherwise use "go" from PATH.
CMD=go
if [[ "${GOROOT}" != "" ]]; then
  CMD="${GOROOT}/bin/go"
fi

CGO_ENABLED=1 GOOS=linux GOARCH=amd64 "${CMD}" build -v -buildvcs=false -trimpath -gcflags=all="-l" -ldflags="-s -w -buildid=" -o bin/xyz2yuv ./
CGO_CFLAGS="-DSIMD_PUMPS=2" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 "${CMD}" build -v -buildvcs=false -trimpath -gcflags=all="-l" -ldflags="-s -w -buildid=" -o bin/xyz2yuv_2pump ./
CGO_CFLAGS="-DSIMD_PUMPS=4" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 "${CMD}" build -v -buildvcs=false -trimpath -gcflags=all="-l" -ldflags="-s -w -buildid=" -o bin/xyz2yuv_4pump ./
CGO_CFLAGS="-DSIMD_PUMPS=8" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 "${CMD}" build -v -buildvcs=false -trimpath -gcflags=all="-l" -ldflags="-s -w -buildid=" -o bin/xyz2yuv_8pump ./

53
colorspace/adaptation.go Normal file
View file

@ -0,0 +1,53 @@
package colorspace
import "gonum.org/v1/gonum/mat"
// ChromaticAdaptation is a 3x3 matrix that maps XYZ tristimulus values into
// the response space used by a chromatic adaptation method.
type ChromaticAdaptation mat.Dense

// AdaptXYZ returns a 3x3 diagonal matrix whose entries are the per-channel
// ratios between the adapted response of the destination white point (to) and
// the adapted response of the source white point (from).
//
// Only the diagonal scaling is returned; the adaptation matrix itself and its
// inverse are not applied here.
func (a ChromaticAdaptation) AdaptXYZ(from, to Illuminant) mat.Matrix {
	transform := (*mat.Dense)(&a)

	// response projects a white point through the adaptation matrix.
	response := func(white Illuminant) *mat.VecDense {
		x, y, z := white.ToXYZ()
		var out mat.VecDense
		out.MulVec(transform, mat.NewVecDense(3, []float64{x, y, z}))
		return &out
	}

	source := response(from)
	destination := response(to)

	ratios := make([]float64, 3)
	for i := range ratios {
		ratios[i] = destination.AtVec(i) / source.AtVec(i)
	}
	return mat.NewDiagDense(3, ratios)
}
// Predefined 3x3 chromatic adaptation matrices, stored row-major.
//
// NOTE(review): these are created with mat.NewDense and are therefore
// *mat.Dense values, not ChromaticAdaptation; a conversion is needed before
// AdaptXYZ can be called on them.
var (
// ChromaticAdaptationBradford is the Bradford adaptation matrix.
ChromaticAdaptationBradford = mat.NewDense(3, 3, []float64{
0.8951, 0.2664, -0.1614,
-0.7502, 1.7135, 0.0367,
0.0389, -0.0685, 1.0296,
})
// ChromaticAdaptationCMCCAT2000 is the CMCCAT2000 adaptation matrix.
ChromaticAdaptationCMCCAT2000 = mat.NewDense(3, 3, []float64{
0.7982, 0.3389, -0.1371,
-0.5918, 1.5512, 0.0406,
0.0008, 0.0239, 0.9753,
})
// ChromaticAdaptationCIECAT02 is the CIECAT02 adaptation matrix.
ChromaticAdaptationCIECAT02 = mat.NewDense(3, 3, []float64{
0.7328, 0.4296, -0.1624,
-0.7036, 1.6975, 0.0061,
0.0030, 0.0136, 0.9834,
})
// ChromaticAdaptationSharp is the "Sharp" adaptation matrix.
ChromaticAdaptationSharp = mat.NewDense(3, 3, []float64{
1.2694, -0.0988, -0.1706,
-0.8364, 1.8006, 0.0357,
0.0297, -0.0315, 1.0018,
})
)

145
colorspace/chromaticity.go Normal file
View file

@ -0,0 +1,145 @@
package colorspace
import "gonum.org/v1/gonum/mat"
// Chromaticity describes an RGB colorspace by the chromaticity coordinates of
// its three primaries and of its white point.
type Chromaticity struct {
	Red   ColorCoordinate
	Green ColorCoordinate
	Blue  ColorCoordinate
	White Illuminant
}

// ConversionXYZ derives the matrices that convert between linear RGB in this
// colorspace and XYZ.
//
// The first result (to) is the RGB -> XYZ matrix and the second (from) is its
// inverse, XYZ -> RGB. The function panics when either the primaries matrix or
// the resulting RGB -> XYZ matrix cannot be inverted.
func (c Chromaticity) ConversionXYZ() (to, from *mat.Dense) {
	// One column per primary, each expressed as XYZ with Y = 1.
	redX, redY, redZ := c.Red.ToXYZ()
	greenX, greenY, greenZ := c.Green.ToXYZ()
	blueX, blueY, blueZ := c.Blue.ToXYZ()
	primaries := mat.NewDense(3, 3, []float64{
		redX, greenX, blueX,
		redY, greenY, blueY,
		redZ, greenZ, blueZ,
	})

	whiteX, whiteY, whiteZ := c.White.ToXYZ()
	white := mat.NewVecDense(3, []float64{whiteX, whiteY, whiteZ})

	// Solve for the per-primary scale that makes R = G = B = 1 land on the
	// white point.
	var primariesInverse mat.Dense
	if err := primariesInverse.Inverse(primaries); err != nil {
		panic(err)
	}
	var scale mat.VecDense
	scale.MulVec(&primariesInverse, white)

	// Scale every column of the primaries matrix by its primary's factor.
	scaled := mat.NewDense(3, 3, nil)
	for row := 0; row < 3; row++ {
		for col := 0; col < 3; col++ {
			scaled.Set(row, col, scale.AtVec(col)*primaries.At(row, col))
		}
	}

	var rgb2xyz, xyz2rgb mat.Dense
	rgb2xyz.CloneFrom(scaled)
	if err := xyz2rgb.Inverse(scaled); err != nil {
		panic(err)
	}
	return &rgb2xyz, &xyz2rgb
}
/*
func (c Chromaticity) XYZToRGB(connectionSpaceWhite Illuminant, adaptation ChromaticAdaptation) {
var err error
var RGB *mat.Dense
var W1, W2 *mat.VecDense
{
Xr, Yr, Zr := c.Red.ToXYZ()
Xg, Yg, Zg := c.Green.ToXYZ()
Xb, Yb, Zb := c.Blue.ToXYZ()
RGB = mat.NewDense(3, 3, []float64{
Xr, Xg, Xb,
Yr, Yg, Yb,
Zr, Zg, Zb,
})
}
{
Xw, Yw, Zw := c.White.ToXYZ()
W1 = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
}
{
Xw, Yw, Zw := connectionSpaceWhite.ToXYZ()
W2 = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
}
var tmp1, tmp2, M, Mc, M2, rgb2xyz, xyz2rgb, source_destination_whites, destination_source_whites, adapted_rgb2xyz_2, adapted_xyz2rgb_2 mat.Dense
var S, crdS, crdD, RA mat.VecDense
if err = tmp1.Inverse(RGB); err != nil {
panic(err)
}
S.MulVec(&tmp1, W1)
//TODO
M.Mul(&S, RGB)
rgb2xyz.CloneFrom(&M)
if err = xyz2rgb.Inverse(&M); err != nil {
panic(err)
}
// chromatic adaptation
crdS.MulVec((*mat.Dense)(&adaptation), W1)
crdD.MulVec((*mat.Dense)(&adaptation), W2)
Mt := mat.NewDiagDense(3, []float64{
crdD.AtVec(0) / crdS.AtVec(0),
crdD.AtVec(1) / crdS.AtVec(1),
crdD.AtVec(2) / crdS.AtVec(2),
})
if err = tmp1.Inverse((*mat.Dense)(&adaptation)); err != nil {
panic(err)
}
tmp2.Mul(&tmp1, Mt)
Mc.Mul(&tmp2, (*mat.Dense)(&adaptation))
source_destination_whites.CloneFrom(&Mc)
if err = destination_source_whites.Inverse(&Mc); err != nil {
panic(err)
}
M2.Mul(&Mc, &M)
adapted_rgb2xyz_2.CloneFrom(&M2)
if err = adapted_xyz2rgb_2.Inverse(&M2); err != nil {
panic(err)
}
RA.MulVec(&Mc, W1)
}
*/

18
colorspace/color.go Normal file
View file

@ -0,0 +1,18 @@
package colorspace
// ColorCoordinate is a pair of chromaticity coordinates: index 0 holds x and
// index 1 holds y.
type ColorCoordinate [2]float64

// X returns the first (x) coordinate.
func (c ColorCoordinate) X() float64 { return c[0] }

// Y returns the second (y) coordinate.
func (c ColorCoordinate) Y() float64 { return c[1] }

// ToXYZ converts the chromaticity pair to XYZ tristimulus values normalized
// so that Y equals 1: X = x/y and Z = (1-x-y)/y.
//
// A zero y coordinate is not special-cased, so the division yields Inf or NaN
// in that case.
func (c ColorCoordinate) ToXYZ() (X, Y, Z float64) {
	x, y := c[0], c[1]
	return x / y, 1.0, (1 - x - y) / y
}

18
colorspace/illuminant.go Normal file
View file

@ -0,0 +1,18 @@
package colorspace
// Illuminant is a white point expressed as a chromaticity coordinate pair.
// It is an alias of ColorCoordinate, so all ColorCoordinate methods apply.
type Illuminant = ColorCoordinate
// Standard Illuminants in 2 degree form
var (
// IlluminantD50 is the D50 white point.
IlluminantD50 = Illuminant{0.34567, 0.35850}
// IlluminantD55 is the D55 white point.
IlluminantD55 = Illuminant{0.33242, 0.34743}
// IlluminantD60 P3-D60 (ACES Cinema)
IlluminantD60 = Illuminant{0.32168, 0.33767}
// IlluminantD63 P3-DCI (Theater)
IlluminantD63 = Illuminant{0.314, 0.351}
// IlluminantD65 Standard D65 for Rec. 709, Rec. 2020, sRGB and many more
IlluminantD65 = Illuminant{0.31271, 0.32902}
)

79
colorspace/relative.go Normal file
View file

@ -0,0 +1,79 @@
package colorspace
// RelativeSystem bundles what is needed to encode linear light into an RGB
// based output colorspace: the primaries and white point, the transfer
// function applied to linear values, and the YCbCr coefficients.
type RelativeSystem struct {
	Chromaticity       Chromaticity
	fromLinearTransfer TransferFunction
	YCbCr              YCbCrConverter
}

// FromLinear applies the system's transfer function to the linear value c.
func (s RelativeSystem) FromLinear(c float64) float64 {
	encode := s.fromLinearTransfer
	return encode(c)
}

// NewRelativeSystem assembles a RelativeSystem from its three components.
func NewRelativeSystem(chromaticity Chromaticity, fromLinearTransfer TransferFunction, converter YCbCrConverter) RelativeSystem {
	var system RelativeSystem
	system.Chromaticity = chromaticity
	system.fromLinearTransfer = fromLinearTransfer
	system.YCbCr = converter
	return system
}
// Predefined output systems. Every entry uses the D65 white point; entries
// differ in their primaries, transfer function and YCbCr coefficients.
var (
	// SystemSRGB uses the Rec. 709 primaries with the sRGB companding curve.
	SystemSRGB = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, CompandingSRGB, YCbCr_Rec709)
	// SystemRec709 uses the piecewise Rec. 709 companding curve.
	SystemRec709 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, CompandingRec709, YCbCr_Rec709)
	// SystemRec709_Pure uses a pure power curve with the Rec. 709 gamma.
	SystemRec709_Pure = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, PureRec709, YCbCr_Rec709)
	// SystemRec709_Pure22 uses a pure power curve with gamma 2.2.
	SystemRec709_Pure22 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, PureRec709_22, YCbCr_Rec709)
	// SystemRec709_Pure24 uses a pure power curve with gamma 2.4.
	SystemRec709_Pure24 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, PureRec709_24, YCbCr_Rec709)
	// SystemRec2020 uses the piecewise Rec. 2020 companding curve.
	SystemRec2020 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.708, 0.292},
		Green: ColorCoordinate{0.170, 0.797},
		Blue:  ColorCoordinate{0.131, 0.046},
		White: IlluminantD65,
	}, CompandingRec2020, YCbCr_Rec2020)
	// SystemRec2020_Pure uses a pure power curve with the Rec. 2020 gamma.
	SystemRec2020_Pure = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.708, 0.292},
		Green: ColorCoordinate{0.170, 0.797},
		Blue:  ColorCoordinate{0.131, 0.046},
		White: IlluminantD65,
	}, PureRec2020, YCbCr_Rec2020)
	// SystemRec2020_Pure24 uses a pure power curve with gamma 2.4.
	// It previously reused PureRec2020, which made it identical to
	// SystemRec2020_Pure.
	SystemRec2020_Pure24 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.708, 0.292},
		Green: ColorCoordinate{0.170, 0.797},
		Blue:  ColorCoordinate{0.131, 0.046},
		White: IlluminantD65,
	}, PureRec2020_24, YCbCr_Rec2020)
)

66
colorspace/types.go Normal file
View file

@ -0,0 +1,66 @@
package colorspace
import "math"
// TransferFunction maps one component value to another, for example linear
// light to a gamma-encoded signal.
type TransferFunction func(e float64) float64

// DCINormalizationFactor is the ratio 48/52.37 by which decoded DCI X'Y'Z'
// values are divided.
// NOTE(review): presumably 48 cd/m² reference white over the 52.37
// normalizing constant of the DCI encoding equation — confirm against the spec.
const DCINormalizationFactor = 48 / 52.37

// Gamma exponents used by the transfer functions below.
const (
	Gamma22     = 2.2
	Gamma24     = 2.4
	GammaDCIXYZ = 2.6
	// GammaSRGB is the approximate overall gamma of sRGB. The piecewise sRGB
	// curve itself uses exponent 2.4, see CompandingSRGB.
	GammaSRGB    = Gamma22
	GammaRec709  = 1 / 0.45
	GammaRec2020 = GammaRec709
)

// https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2020-2-201510-I!!PDF-E.pdf
const alpha = 1.09929682680944 // 10 * Math.pow(beta, 0.55)
const beta = 0.018053968510807

var (
	// TransferFromDCIXYZ rescales a decoded DCI value by dividing it by
	// DCINormalizationFactor.
	TransferFromDCIXYZ TransferFunction = func(e float64) float64 {
		return e / DCINormalizationFactor
	}
	// CompandingSRGB encodes linear light with the piecewise sRGB curve:
	// a linear segment up to 0.0031308 and a 1/2.4 power segment above it.
	// The constants 1.055, 0.055 and 12.92 are only continuous at the knee
	// when paired with exponent 1/2.4, so Gamma24 is used here rather than
	// GammaSRGB.
	CompandingSRGB TransferFunction = func(e float64) float64 {
		if e <= 0.0031308 {
			return 12.92 * e
		}
		return 1.055*math.Pow(e, 1/Gamma24) - 0.055
	}
	// CompandingRec709 ITU-R BT.709
	CompandingRec709 TransferFunction = func(e float64) float64 {
		if e < beta {
			return 4.5 * e
		}
		return alpha*math.Pow(e, 1/GammaRec709) - (alpha - 1)
	}
	// PureRec709 is a pure power curve with exponent 1/GammaRec709.
	PureRec709 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/GammaRec709)
	}
	// PureRec709_22 is a pure power curve with exponent 1/2.2.
	PureRec709_22 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/Gamma22)
	}
	// PureRec709_24 is a pure power curve with exponent 1/2.4.
	PureRec709_24 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/Gamma24)
	}
	// The Rec. 2020 variants reuse the Rec. 709 curves.
	CompandingRec2020 = CompandingRec709
	PureRec2020       = PureRec709
	PureRec2020_22    = PureRec709_22
	PureRec2020_24    = PureRec709_24
)
// LUT is a lookup table indexed by an integer code value.
type LUT []float64

// NewGammaLUT builds a table with 2^bits entries in which entry i holds
// (i / (2^bits - 1)) raised to the power gamma.
func NewGammaLUT(gamma float64, bits int) (lut LUT) {
	maxCode := (1 << bits) - 1
	scale := float64(maxCode)

	lut = make(LUT, 0, maxCode+1)
	for code := 0; code <= maxCode; code++ {
		lut = append(lut, math.Pow(float64(code)/scale, gamma))
	}
	return lut
}

28
colorspace/xyz.go Normal file
View file

@ -0,0 +1,28 @@
package colorspace
import "math"
// XYZSystem An absolute representation
//
// It decodes gamma-encoded XYZ code values to linear light, either through a
// precomputed 16-bit lookup table or by evaluating the power function.
type XYZSystem struct {
	// gamma is the decoding exponent used by ToLinear.
	gamma float64
	// ToLinearLUT holds the gamma curve sampled for every 16-bit code value.
	ToLinearLUT LUT
	// ToLinearTransfer any adjustments on top of gamma
	ToLinearTransfer TransferFunction
}

// ToLinearFrom16 decodes a 16-bit code value through the lookup table and
// then applies ToLinearTransfer.
func (s XYZSystem) ToLinearFrom16(c uint16) float64 {
	return s.ToLinearTransfer(s.ToLinearLUT[c])
}

// ToLinear decodes a normalized value by raising it to the system gamma and
// then applying ToLinearTransfer.
func (s XYZSystem) ToLinear(c float64) float64 {
	return s.ToLinearTransfer(math.Pow(c, s.gamma))
}

// NewXYZSystem creates an XYZSystem for the given decoding gamma and
// post-gamma transfer function, precomputing the 16-bit lookup table.
func NewXYZSystem(gamma float64, toLinearTransfer TransferFunction) XYZSystem {
	return XYZSystem{
		// gamma must be stored: ToLinear reads it, and leaving it at zero
		// would turn math.Pow(c, gamma) into the constant 1.
		gamma:            gamma,
		ToLinearLUT:      NewGammaLUT(gamma, 16),
		ToLinearTransfer: toLinearTransfer,
	}
}

// DCIXYZSystem is the DCI X'Y'Z' system: gamma 2.6 followed by the DCI
// normalization transfer.
var DCIXYZSystem = NewXYZSystem(GammaDCIXYZ, TransferFromDCIXYZ)

35
colorspace/yuv.go Normal file
View file

@ -0,0 +1,35 @@
package colorspace
import "gonum.org/v1/gonum/mat"
// YCbCrConverter holds the luma coefficients of a YCbCr encoding.
type YCbCrConverter struct {
	Kr, Kg, Kb float64
}

// ConversionRGB builds the matrices that convert between RGB and YPbPr for
// these luma coefficients.
//
// The first result (to) converts YPbPr to RGB and the second (from) converts
// RGB to YPbPr.
func (c YCbCrConverter) ConversionRGB() (to, from *mat.Dense) {
	const half = 1. / 2.

	// Rows: Y, Pb, Pr.
	forward := []float64{
		c.Kr, c.Kg, c.Kb,
		-half * (c.Kr / (1 - c.Kb)), -half * (c.Kg / (1 - c.Kb)), half,
		half, -half * (c.Kg / (1 - c.Kr)), -half * (c.Kb / (1 - c.Kr)),
	}
	// Rows: R, G, B.
	inverse := []float64{
		1, 0, 2 - 2*c.Kr,
		1, -(c.Kb / c.Kg) * (2 - 2*c.Kb), -(c.Kr / c.Kg) * (2 - 2*c.Kr),
		1, 2 - 2*c.Kb, 0,
	}

	rgbToYPbPr := mat.NewDense(3, 3, forward)
	yPbPrToRgb := mat.NewDense(3, 3, inverse)
	return yPbPrToRgb, rgbToYPbPr
}

// NewYCbCrConverter creates a converter from the three luma coefficients.
func NewYCbCrConverter(kr, kg, kb float64) YCbCrConverter {
	var converter YCbCrConverter
	converter.Kr, converter.Kg, converter.Kb = kr, kg, kb
	return converter
}
// YCbCr_Rec709 holds the Rec. 709 luma coefficients (Kr, Kg, Kb).
var YCbCr_Rec709 = NewYCbCrConverter(0.2126, 0.7152, 0.0722)

// YCbCr_Rec2020 holds the Rec. 2020 luma coefficients (Kr, Kg, Kb).
// Kr is 0.2627; the three coefficients sum to 1.
var YCbCr_Rec2020 = NewYCbCrConverter(0.2627, 0.6780, 0.0593)

103
conv/conv.c Normal file
View file

@ -0,0 +1,103 @@
#include "conv.h"
#include <math.h>
#include <stdlib.h>
// Largest 16-bit unsigned value (65535) as a double.
const double uint16Max = (double)((1<<16)-1);
// 2^15 (32768) as a double.
const double int16MaxPlusOne = (double)(1<<15);
// Conversion matrices and output gamma captured by init().
// The two pointers alias caller-owned memory; init() does not copy the data.
const double* restrict xyz2rgb_g;
const double* restrict rgb2yuv_g;
// Holds 1/rgbGamma as stored by init().
double rgbGamma_g;
// Pipeline selection: picks the 512-bit AVX-512 pipeline, the 256-bit AVX2
// pipeline, or the generic scalar implementation, and sets DecoderInformation
// to a matching description.
// NOTE(review): typed_converter_platform is defined outside this file
// (presumably in conv_gen.h) and appears to instantiate the per-type line
// converters — confirm there.
#if USE_SIMD
#include "conv_gen.h"
#endif
#if USE_SIMD && __AVX2__
#include "conv_avx.h"
// The feature test must match the one in conv_avx.h, which only defines the
// 512-bit load/store/pow helpers when AVX512DQ is available as well.
#if USE_512_WIDE_PIPELINE && __AVX512F__ && __AVX512VL__ && __AVX512BW__ && __AVX512DQ__
//512-bit wide pipeline
typed_converter_platform(x86_64, SIMD_PUMPS, float, 5, 1, load_packed_512f, store_packed_512f);
typed_converter_platform(x86_64, SIMD_PUMPS, double, 2, 1, load_packed_512, store_packed_512);
const char* DecoderInformation = "SIMD AVX-512 512-bit pipeline (2d 5f " str(SIMD_PUMPS) "pump)";
#else
// __FMA__ is passed on as a 0/1 flag, so give it a value when the compiler
// did not define it.
#if !defined(__FMA__)
#define __FMA__ 0
const char* DecoderInformation = "SIMD AVX2 256-bit pipeline (1d 2f " str(SIMD_PUMPS) "pump)";
#else
const char* DecoderInformation = "SIMD AVX2 + FMA 256-bit pipeline (1d 2f " str(SIMD_PUMPS) "pump)";
#endif
//256-bit wide pipeline
typed_converter_platform(x86_64, SIMD_PUMPS, float, 2, __FMA__, load_packed_256f, store_packed_256f);
typed_converter_platform(x86_64, SIMD_PUMPS, double, 1, __FMA__, load_packed_256, store_packed_256);
#endif
#else
//No ASM defined
// conv_generic.h is included once per floating point type.
#define fType float
#include "conv_generic.h"
#undef fType
#define fType double
#include "conv_generic.h"
// Force USE_SIMD to 0 so the SIMD-only setup in init() is skipped.
#undef USE_SIMD
#define USE_SIMD 0
const char* DecoderInformation = "Generic scalar pipeline (1d 1f 1pump)";
#endif
// Returns the description of the pipeline selected at compile time.
// The string is static storage and must not be freed by the caller.
const char* decoder_information() {
    const char* description = DecoderInformation;
    return description;
}
// Prepares the converter state.
//
// xyz2rgb / rgb2yuv: 3x3 matrices as nine consecutive doubles. The pointers
//                    are stored as-is, so the memory must stay valid for as
//                    long as the converters are used.
// xyzGamma:          exponent used to fill the XYZ -> linear lookup tables.
// rgbGamma:          output gamma; its reciprocal is stored.
void init(const double* restrict xyz2rgb, const double* restrict rgb2yuv, double xyzGamma, double rgbGamma) {
    const int tableEntries = 1 << XYZ_LOOKUP_TABLE_SIZE;
    const int maxCode = tableEntries - 1;

    xyz2rgb_g = xyz2rgb;
    rgb2yuv_g = rgb2yuv;
    rgbGamma_g = 1./rgbGamma;

    // Sample the decoding curve for every code value; the float table is
    // filled from the double table.
    for (int code = 0; code < tableEntries; ++code) {
        xyz12_to_linear_double[code] = pow((double)(code)/maxCode, xyzGamma);
        xyz12_to_linear_float[code] = xyz12_to_linear_double[code];
    }
#if USE_SIMD
    // Load both matrices into their SIMD representations, once per type.
    _load_matrix_double(xyz2rgb_mat_double, xyz2rgb);
    _load_matrix_float(xyz2rgb_mat_float, xyz2rgb);
    _load_matrix_double(rgb2yuv_mat_double, rgb2yuv);
    _load_matrix_float(rgb2yuv_mat_float, rgb2yuv);
#endif
}
// Converts a whole frame using the single precision line converter.
// The input advances by width*3 elements per row and each output plane by
// width elements per row. width and height are forwarded unchanged to every
// line conversion.
void convert_frame_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
    const int inputStride = width*3;
    for (int rowsLeft = height; rowsLeft > 0; --rowsLeft) {
        convert_line_dci_xyz12_to_yuv16_float(in, luma, cb, cr, width, height);
        in += inputStride;
        luma += width;
        cb += width;
        cr += width;
    }
}
// Converts a whole frame using the double precision line converter.
// The input advances by width*3 elements per row and each output plane by
// width elements per row. width and height are forwarded unchanged to every
// line conversion.
void convert_frame_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
    const int inputStride = width*3;
    for (int rowsLeft = height; rowsLeft > 0; --rowsLeft) {
        convert_line_dci_xyz12_to_yuv16_double(in, luma, cb, cr, width, height);
        in += inputStride;
        luma += width;
        cb += width;
        cr += width;
    }
}

127
conv/conv.go Normal file
View file

@ -0,0 +1,127 @@
package conv
import (
"gonum.org/v1/gonum/mat"
"runtime"
"unsafe"
)
/*
#cgo CFLAGS: -march=native -mtune=native -Ofast -std=c99
#cgo LDFLAGS: -lm
#include "conv.h"
*/
import "C"
// matPinner pins the matrix data handed to C by InitData. It is never
// unpinned in this file because the C side stores the pointers it receives.
var matPinner runtime.Pinner
// DecoderInformation returns the description of the conversion pipeline that
// was selected when the C code was compiled.
func DecoderInformation() string {
	description := C.decoder_information()
	return C.GoString(description)
}
// InitData passes the two conversion matrices and the gamma values to the C
// converter.
//
// The backing arrays of both matrices are pinned through the package-level
// pinner before the call and stay pinned afterwards.
func InitData(xyz2rgb, rgb2yuv *mat.Dense, xyzGamma, rgbGamma float64) {
	xyzData := xyz2rgb.RawMatrix().Data
	yuvData := rgb2yuv.RawMatrix().Data

	xyzPtr := unsafe.Pointer(unsafe.SliceData(xyzData))
	yuvPtr := unsafe.Pointer(unsafe.SliceData(yuvData))
	matPinner.Pin(xyzPtr)
	matPinner.Pin(yuvPtr)

	C.init(
		(*C.double)(xyzPtr),
		(*C.double)(yuvPtr),
		C.double(xyzGamma),
		C.double(rgbGamma),
	)
}
// ConvertFrameDCIXYZToYUV16Double converts a whole frame with the double
// precision C converter. in is the packed input and luma, cb and cr receive
// the three output planes. All four buffers are pinned for the duration of
// the call.
func ConvertFrameDCIXYZToYUV16Double(in []uint32, luma, cb, cr []uint16, width, height int) {
	var (
		src  = unsafe.Pointer(unsafe.SliceData(in))
		dstY = unsafe.Pointer(unsafe.SliceData(luma))
		dstB = unsafe.Pointer(unsafe.SliceData(cb))
		dstR = unsafe.Pointer(unsafe.SliceData(cr))
	)

	var pins runtime.Pinner
	defer pins.Unpin()
	for _, p := range []unsafe.Pointer{src, dstY, dstB, dstR} {
		pins.Pin(p)
	}

	C.convert_frame_dci_xyz12_to_yuv16_double(
		(*C.uint)(src),
		(*C.ushort)(dstY),
		(*C.ushort)(dstB),
		(*C.ushort)(dstR),
		C.int(width),
		C.int(height),
	)
}
// ConvertLineDCIXYZToYUV16Double converts a single line with the double
// precision C converter. in is the packed input and luma, cb and cr receive
// the three outputs. width and height are forwarded unchanged. All four
// buffers are pinned for the duration of the call.
func ConvertLineDCIXYZToYUV16Double(in []uint32, luma, cb, cr []uint16, width, height int) {
	var (
		src  = unsafe.Pointer(unsafe.SliceData(in))
		dstY = unsafe.Pointer(unsafe.SliceData(luma))
		dstB = unsafe.Pointer(unsafe.SliceData(cb))
		dstR = unsafe.Pointer(unsafe.SliceData(cr))
	)

	var pins runtime.Pinner
	defer pins.Unpin()
	for _, p := range []unsafe.Pointer{src, dstY, dstB, dstR} {
		pins.Pin(p)
	}

	C.convert_line_dci_xyz12_to_yuv16_double(
		(*C.uint)(src),
		(*C.ushort)(dstY),
		(*C.ushort)(dstB),
		(*C.ushort)(dstR),
		C.int(width),
		C.int(height),
	)
}
// ConvertFrameDCIXYZToYUV16Float converts a whole frame with the single
// precision C converter. in is the packed input and luma, cb and cr receive
// the three output planes. All four buffers are pinned for the duration of
// the call.
func ConvertFrameDCIXYZToYUV16Float(in []uint32, luma, cb, cr []uint16, width, height int) {
	var (
		src  = unsafe.Pointer(unsafe.SliceData(in))
		dstY = unsafe.Pointer(unsafe.SliceData(luma))
		dstB = unsafe.Pointer(unsafe.SliceData(cb))
		dstR = unsafe.Pointer(unsafe.SliceData(cr))
	)

	var pins runtime.Pinner
	defer pins.Unpin()
	for _, p := range []unsafe.Pointer{src, dstY, dstB, dstR} {
		pins.Pin(p)
	}

	C.convert_frame_dci_xyz12_to_yuv16_float(
		(*C.uint)(src),
		(*C.ushort)(dstY),
		(*C.ushort)(dstB),
		(*C.ushort)(dstR),
		C.int(width),
		C.int(height),
	)
}
// ConvertLineDCIXYZToYUV16Float converts a single line with the single
// precision C converter. in is the packed input and luma, cb and cr receive
// the three outputs. width and height are forwarded unchanged. All four
// buffers are pinned for the duration of the call.
func ConvertLineDCIXYZToYUV16Float(in []uint32, luma, cb, cr []uint16, width, height int) {
	var (
		src  = unsafe.Pointer(unsafe.SliceData(in))
		dstY = unsafe.Pointer(unsafe.SliceData(luma))
		dstB = unsafe.Pointer(unsafe.SliceData(cb))
		dstR = unsafe.Pointer(unsafe.SliceData(cr))
	)

	var pins runtime.Pinner
	defer pins.Unpin()
	for _, p := range []unsafe.Pointer{src, dstY, dstB, dstR} {
		pins.Pin(p)
	}

	C.convert_line_dci_xyz12_to_yuv16_float(
		(*C.uint)(src),
		(*C.ushort)(dstY),
		(*C.ushort)(dstB),
		(*C.ushort)(dstR),
		C.int(width),
		C.int(height),
	)
}

50
conv/conv.h Normal file
View file

@ -0,0 +1,50 @@
#include <stdint.h>
// Stores the two 3x3 matrices (nine doubles each) and the gamma values used by
// the converters and fills the XYZ lookup tables. The matrix pointers are
// retained, not copied, so the memory must outlive all conversions.
void init(const double* restrict xyz2rgb, const double* restrict rgb2yuv, double xyzGamma, double rgbGamma);
// Use available SIMD. Set to 0 to enforce the generic pipeline.
#if !defined(USE_SIMD)
#define USE_SIMD 1
#endif
// Opportunistically use AVX-512 features even in 256-bit mode.
#if !defined(USE_OPPORTUNISTIC_AVX512)
#define USE_OPPORTUNISTIC_AVX512 1
#endif
//AVX2
// double layout aaa0
// float layout aaabbb00
// Enable usage of the 512-bit wide pipeline, pumping two pixels every iteration, if supported.
// Uses AVX-512 features. Requires AVX-512 F, VL, BW (conv_avx.h additionally checks DQ). Layout aaabbb00
// double layout aaabbb00
// float layout aaabbbcccdddeee0
#if !defined(USE_512_WIDE_PIPELINE)
#define USE_512_WIDE_PIPELINE 1
#endif
// Sets the number of pumps per iteration of the pipeline. Supported values: 1, 2, 4, 8.
// Raise this if your architecture has more execution units than usual. Recommended to stay at 2 or 4.
#if !defined(SIMD_PUMPS)
#define SIMD_PUMPS 1
#endif
// ExpandLoad or CompressStore is currently slower than doing set(vals...). todo: inspect with CPUs newer than ZEN4
#if !defined(USE_AVX512_EXPANDLOAD)
#define USE_AVX512_EXPANDLOAD 0
#endif
// Size of the lookup table in bits. Only valid value is 12.
#if !defined(XYZ_LOOKUP_TABLE_SIZE)
#define XYZ_LOOKUP_TABLE_SIZE 12
#endif
// The line converters can run slightly out of bounds: about 24 bytes per pump per line on the
// input (XYZ) and 8 bytes per pump per line on the output (YUV). The caller should allocate
// extra input/output buffer space for this reason.
void convert_line_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
// Converts height lines; the input advances width*3 elements and each output plane width elements per line.
void convert_frame_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
// Double precision variants of the two functions above.
void convert_line_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
void convert_frame_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
// Returns a static string describing the pipeline selected at compile time.
const char* decoder_information();

271
conv/conv_avx.h Normal file
View file

@ -0,0 +1,271 @@
#include <immintrin.h>
#include <stdint.h>
// x86_64 AVX2 and AVX512 definitions
// Index helpers for 16-bit permutes over vectors of 32-bit lanes.
// _perm_component_i(i, c, v): 16-bit element index of the low half of 32-bit
// lane (i*v + c), i.e. component c of pixel i when pixels are v lanes apart.
#define _perm_component_i(i, c, v) ((i*v+c)*2)
// Component c of two pixels spaced four lanes apart, listed pixel 1 then pixel 0.
#define _perm_component2(c) _perm_component_i(1, c, 4), _perm_component_i(0, c, 4)
// Component c of five pixels spaced three lanes apart, listed pixel 4 down to pixel 0.
#define _perm_component5(c) _perm_component_i(4, c, 3), _perm_component_i(3, c, 3), _perm_component_i(2, c, 3), _perm_component_i(1, c, 3), _perm_component_i(0, c, 3)
#if USE_512_WIDE_PIPELINE && __AVX512F__ && __AVX512VL__ && __AVX512BW__ && __AVX512DQ__
// Loads two pixels (six 32-bit values) starting at in + j*3 and spreads them
// over eight lanes as a0 a1 a2 0 b0 b1 b2 0.
__m256i load_packed_512(int j, const uint32_t* restrict in) {
#if USE_AVX512_EXPANDLOAD
//todo: this path is slower than _mm256_set_epi32, both mask and maskz ZEN4
return _mm256_maskz_expandloadu_epi32(_cvtu32_mask8(0b01110111), in+j*3);
#else
// The masked load zeroes lanes 6 and 7; the permute then moves the second
// pixel to lanes 4-6 and uses the zero from lane 7 as padding.
return _mm256_permutexvar_epi32(_mm256_set_epi32(7, 5, 4, 3, 7, 2, 1, 0), _mm256_maskz_loadu_epi32(_cvtu32_mask8(0b00111111), in+j*3));
#endif
}
// Stores two converted pixels: the low 16 bits of lanes 0/4, 1/5 and 2/6 go to
// luma[j..j+1], cb[j..j+1] and cr[j..j+1] respectively.
void store_packed_512(__m256i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
// Gathers the six 16-bit results into the low words: Y0 Y1 Cb0 Cb1 Cr0 Cr1.
#define _perm_idx512 _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component2(2), _perm_component2(1), _perm_component2(0))
__m128i store = _mm256_castsi256_si128(_mm256_permutexvar_epi16(_perm_idx512, packed));
// The base pointers are shifted back so that the selected mask bits land on
// index j; only the elements selected by the mask are written.
_mm_mask_storeu_epi16(&luma[j], _cvtu32_mask8(0b00000011), store);
_mm_mask_storeu_epi16(&cb[j-2], _cvtu32_mask8(0b00001100), store);
_mm_mask_storeu_epi16(&cr[j-4], _cvtu32_mask8(0b00110000), store);
}
// Loads five pixels (fifteen 32-bit values) starting at in + j*3; the
// sixteenth lane is zeroed by the mask.
__m512i load_packed_512f(int j, const uint32_t* restrict in) {
return _mm512_maskz_loadu_epi32(_cvtu32_mask16(0b0111111111111111), in+j*3);
}
// Stores five converted pixels: the low 16 bits of each component lane go to
// luma[j..j+4], cb[j..j+4] and cr[j..j+4].
void store_packed_512f(__m512i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
// Gathers the fifteen 16-bit results into the low words: Y0-Y4, Cb0-Cb4, Cr0-Cr4.
#define _perm_idx512f _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component5(2), _perm_component5(1), _perm_component5(0))
__m256i store = _mm512_castsi512_si256(_mm512_permutexvar_epi16(_perm_idx512f, packed));
// The base pointers are shifted back so that the selected mask bits land on
// index j; only the elements selected by the mask are written.
_mm256_mask_storeu_epi16(&luma[j ], _cvtu32_mask16(0b0000000000011111), store);
_mm256_mask_storeu_epi16(&cb[j -5], _cvtu32_mask16(0b0000001111100000), store);
_mm256_mask_storeu_epi16(&cr[j-10], _cvtu32_mask16(0b0111110000000000), store);
}
#define SSE 1
#define AVX 1
#define AVX512 1
#include "simd_utils/simd_utils.h"
static inline __m512d pow512_pd1(__m512d x, double y1) {
const __m512d y = _mm512_set1_pd(y1);
return exp512_pd(_mm512_mul_pd(y, log512_pd(x)));
}
// Raises every float element of x to the scalar power y1.
// Each 256-bit half is widened to double, evaluated as exp(y1 * log(x)) and
// narrowed back to float.
static inline __m512 pow512_ps1(__m512 x, double y1) {
    const __m512d exponent = _mm512_set1_pd(y1);
    //low precision cause issues, use doubles
    const __m512d lowWide = _mm512_cvtps_pd(_mm512_castps512_ps256(x));
    const __m256 lowResult = _mm512_cvtpd_ps(exp512_pd(_mm512_mul_pd(exponent, log512_pd(lowWide))));
    const __m512d highWide = _mm512_cvtps_pd(_mm512_extractf32x8_ps(x, 1));
    const __m256 highResult = _mm512_cvtpd_ps(exp512_pd(_mm512_mul_pd(exponent, log512_pd(highWide))));
    return _mm512_insertf32x8(_mm512_castps256_ps512(lowResult), highResult, 1);
}
#else
// Loads one pixel (three 32-bit values) starting at in + j*3 into lanes 0-2.
// With AVX-512 the fourth lane is zeroed by the mask. In the fallback path it
// holds whatever follows the pixel in memory, so the read extends one element
// past the pixel.
__m128i load_packed_256(int j, const uint32_t* restrict in) {
#if USE_OPPORTUNISTIC_AVX512 && __AVX512F__ && __AVX512VL__ && __AVX512DQ__
    return _mm_maskz_loadu_epi32(_cvtu32_mask8(0b00000111), in+j*3);
#else
    // _mm_loadu_epi32 is an AVX-512 intrinsic and is not usable in this
    // branch; the SSE2 unaligned 128-bit load reads the same 16 bytes.
    return _mm_loadu_si128((const __m128i*)(in+j*3));
    //return _mm_set_epi32(0, inz[j], iny[j], inx[j]);
#endif
}
// Stores one converted pixel: lanes 0, 1 and 2 of packed are written to
// luma[j], cb[j] and cr[j], truncated to 16 bits.
void store_packed_256(__m128i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
    const int32_t* lanes = (const int32_t*)(&packed);
    luma[j] = lanes[0];
    cb[j] = lanes[1];
    cr[j] = lanes[2];
}
/*#if USE_DOT_PRODUCT
//position entries properly in vectors ready for dot product
m[0] = _mm256_set_ps(0, in[2], in[1], in[0], 0, in[2], in[1], in[0]); //0rrr 0rrr
m[1] = _mm256_set_ps(0, in[5], in[4], in[3], 0, in[5], in[4], in[3]); //0ggg 0ggg
m[2] = _mm256_set_ps(0, in[8], in[7], in[6], 0, in[8], in[7], in[6]); //0bbb 0bbb
__m256 a = _mm256_dp_ps(v, m0, 0b01110001);
__m256 b = _mm256_dp_ps(v, m1, 0b01110010);
__m256 c = _mm256_dp_ps(v, m2, 0b01110100);
return _mm256_blend_ps(_mm256_blend_ps(a, b, 0b00100010), c, 0b01000100);
*/
__m256i load_packed_256f(int j, const uint32_t* restrict in) {
#if USE_OPPORTUNISTIC_AVX512 && USE_AVX512_EXPANDLOAD && __AVX512VBMI2__ && __AVX512VL__ && __AVX512F__
//todo: slow ZEN4
return _mm256_maskz_expandloadu_epi32(_cvtu32_mask8(0b01110111), in+j*3);
#else
#if USE_OPPORTUNISTIC_AVX512 && __AVX512F__ && __AVX512BW__ && __AVX512VL__ && __AVX512DQ__
return _mm256_permutexvar_epi32(_mm256_set_epi32(7, 5, 4, 3, 7, 2, 1, 0), _mm256_maskz_loadu_epi32(_cvtu32_mask8(0b00111111), in+j*3));
#else
return _mm256_set_epi32(0, in[j+5], in[j+4], in[j+3], 0, in[j+2], in[j+1], in[j]);
#endif
#endif
}
void store_packed_256f(__m256i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
#if USE_OPPORTUNISTIC_AVX512 && __AVX512BW__ && __AVX512VL__ && __AVX512DQ__
#define _perm_idx256 _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component2(2), _perm_component2(1), _perm_component2(0))
__m128i store = _mm256_castsi256_si128(_mm256_permutexvar_epi16(_perm_idx256, packed));
_mm_mask_storeu_epi16(&luma[j], _cvtu32_mask8(0b00000011), store);
_mm_mask_storeu_epi16(&cb[j-2], _cvtu32_mask8(0b00001100), store);
_mm_mask_storeu_epi16(&cr[j-4], _cvtu32_mask8(0b00110000), store);
#else
luma[j/3] = ((int32_t*)(&packed))[0];
cb[j/3] = ((int32_t*)(&packed))[1];
cr[j/3] = ((int32_t*)(&packed))[2];
luma[j/3+1] = ((int32_t*)(&packed))[4];
cb[j/3+1] = ((int32_t*)(&packed))[5];
cr[j/3+1] = ((int32_t*)(&packed))[6];
#endif
}
#define SSE 1
#define AVX 1
#include "simd_utils/simd_utils.h"
static inline __m256d pow256_pd1(__m256d x, double y1) {
const __m256d y = _mm256_set1_pd(y1);
return exp256_pd(_mm256_mul_pd(y, log256_pd(x)));
}
// Raises every float element of x to the scalar power y1.
// Each 128-bit half is widened to double, evaluated as exp(y1 * log(x)) and
// narrowed back to float.
static inline __m256 pow256_ps1(__m256 x, double y1) {
    const __m256d exponent = _mm256_set1_pd(y1);
    //low precision cause issues, use doubles
    const __m256d lowWide = _mm256_cvtps_pd(_mm256_castps256_ps128(x));
    const __m128 lowResult = _mm256_cvtpd_ps(exp256_pd(_mm256_mul_pd(exponent, log256_pd(lowWide))));
    const __m256d highWide = _mm256_cvtps_pd(_mm256_extractf128_ps(x, 1));
    const __m128 highResult = _mm256_cvtpd_ps(exp256_pd(_mm256_mul_pd(exponent, log256_pd(highWide))));
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lowResult), highResult, 1);
}
#endif
// Vector width in bits per (float type, pixels per vector) combination.
#define typed_vector_size_x86_64_float_5 512
#define typed_vector_size_x86_64_double_2 512
#define typed_vector_size_x86_64_float_2 256
#define typed_vector_size_x86_64_double_1 256
// Name fragment of the lane permute intrinsic for each combination.
// These four macros are defined again, identically, further down in this file.
#define typed_permute_lanes_x86_64_float_5 _permutexvar_
#define typed_permute_lanes_x86_64_double_2 _permutex_
#define typed_permute_lanes_x86_64_float_2 _permute_
#define typed_permute_lanes_x86_64_double_1 _permute4x64_
// Wrapper around _mm512_i32gather_ps that takes its arguments in
// (base, index, scale) order. The switch passes the scale to the intrinsic as
// a literal in every branch; scales other than 1, 2, 4 and 8 are declared
// unreachable.
inline __attribute__((always_inline)) __m512 _int_i32gather_ps(void const* base_addr, __m512i vindex, int scale) {
switch(scale){
case 1:
return _mm512_i32gather_ps(vindex, base_addr, 1);
case 2:
return _mm512_i32gather_ps(vindex, base_addr, 2);
case 4:
return _mm512_i32gather_ps(vindex, base_addr, 4);
case 8:
return _mm512_i32gather_ps(vindex, base_addr, 8);
default:
__builtin_unreachable();
}
}
// Wrapper around _mm512_i32gather_pd that takes its arguments in
// (base, index, scale) order. The switch passes the scale to the intrinsic as
// a literal in every branch; scales other than 1, 2, 4 and 8 are declared
// unreachable.
inline __attribute__((always_inline)) __m512d _int_i32gather_pd(void const* base_addr, __m256i vindex, int scale) {
switch(scale){
case 1:
return _mm512_i32gather_pd(vindex, base_addr, 1);
case 2:
return _mm512_i32gather_pd(vindex, base_addr, 2);
case 4:
return _mm512_i32gather_pd(vindex, base_addr, 4);
case 8:
return _mm512_i32gather_pd(vindex, base_addr, 8);
default:
__builtin_unreachable();
}
}
// Scalar-exponent pow implementation for each (float type, pixels per vector) combination.
#define typed_pow1_x86_64_float_5 pow512_ps1
#define typed_pow1_x86_64_double_2 pow512_pd1
#define typed_pow1_x86_64_float_2 pow256_ps1
#define typed_pow1_x86_64_double_1 pow256_pd1
#define typed_pow1_x86_64(floatType, elementCount) typed_pow1_x86_64_##floatType##_##elementCount
// Gather implementation for each combination; the 512-bit variants go through
// the argument-reordering wrappers defined in this file.
#define typed_i32gather_x86_64_float_5 _int_i32gather_ps
#define typed_i32gather_x86_64_double_2 _int_i32gather_pd
#define typed_i32gather_x86_64_float_2 _mm256_i32gather_ps
#define typed_i32gather_x86_64_double_1 _mm256_i32gather_pd
#define typed_i32gather_x86_64(floatType, elementCount) typed_i32gather_x86_64_##floatType##_##elementCount
// Building blocks for composing intrinsic and vector type names by token pasting.
#define typed_vector_func_prefix_x86_64 _mm
#define typed_vector_type_prefix_x86_64 __m
#define typed_vector_type_suffix_x86_64_double d
#define typed_vector_type_suffix_x86_64_int i
// Width of the integer vector paired with each floating point configuration,
// and the matching intrinsic prefix (128-bit intrinsics use the bare _mm prefix).
#define typed_vector_int_size_x86_64_float_5 512
#define typed_vector_int_func_x86_64_float_5 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_float_5)
#define typed_vector_int_size_x86_64_double_2 256
#define typed_vector_int_func_x86_64_double_2 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_double_2)
#define typed_vector_int_size_x86_64_float_2 256
#define typed_vector_int_func_x86_64_float_2 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_float_2)
#define typed_vector_int_size_x86_64_double_1 128
#define typed_vector_int_func_x86_64_double_1 typed_vector_func_prefix_x86_64
// Intrinsic name suffix per floating point type.
#define typed_vector_func_type_x86_64_float ps
#define typed_vector_func_type_x86_64_double pd
// Vector type names: float vectors have no suffix, double vectors end in "d",
// integer vectors end in "i".
#define typed_vector_type_x86_64_float(elementCount) concat(typed_vector_type_prefix_x86_64, typed_vector_size_x86_64_float_##elementCount)
#define typed_vector_type_x86_64_double(elementCount) concat3(typed_vector_type_prefix_x86_64, typed_vector_size_x86_64_double_##elementCount, typed_vector_type_suffix_x86_64_double)
#define typed_vector_type_x86_64(floatType, elementCount) typed_vector_type_x86_64_##floatType(elementCount)
#define typed_vector_int_type_x86_64(floatType, elementCount) concat3(typed_vector_type_prefix_x86_64, typed_vector_int_size_x86_64_##floatType##_##elementCount, typed_vector_type_suffix_x86_64_int)
// Intrinsic prefix including the vector width, e.g. _mm256 or _mm512.
#define typed_vector_func_x86_64_float(elementCount) concat(typed_vector_func_prefix_x86_64, typed_vector_size_x86_64_float_##elementCount)
#define typed_vector_func_x86_64_double(elementCount) concat(typed_vector_func_prefix_x86_64, typed_vector_size_x86_64_double_##elementCount)
#define typed_vector_func_x86_64(floatType, elementCount) typed_vector_func_x86_64_##floatType(elementCount)
// Name fragment of the lane permute intrinsic for each configuration.
// (Identical to the definitions earlier in this file.)
#define typed_permute_lanes_x86_64_float_5 _permutexvar_
#define typed_permute_lanes_x86_64_double_2 _permutex_
#define typed_permute_lanes_x86_64_float_2 _permute_
#define typed_permute_lanes_x86_64_double_1 _permute4x64_
// Middle fragments of the arithmetic intrinsic names.
#define typed_fmadd_x86_64 _fmadd_
#define typed_add_x86_64 _add_
#define typed_mul_x86_64 _mul_
#define typed_set1_x86_64 _set1_
#define typed_set_x86_64 _set_
#define typed_min_x86_64 _min_
#define typed_max_x86_64 _max_
// Integer helpers and float -> int32 conversion suffixes.
#define typed_seti_x86_64 _set_epi32
#define typed_addi_x86_64 _add_epi32
#define typed_cvt_x86_64_float _cvtps_epi32
#define typed_cvt_x86_64_double _cvtpd_epi32
// Full intrinsic names, pasted together as <prefix+width><operation><type suffix>.
#define typed_func_x86_64_fmadd(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_fmadd_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_add(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_add_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_mul(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_mul_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_set1(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_set1_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_set(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_set_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_min(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_min_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_max(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_max_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_ftoi(floatType, elementCount) concat(typed_vector_func_x86_64(floatType, elementCount), typed_cvt_x86_64_##floatType)
#define typed_func_x86_64_seti(floatType, elementCount) concat(typed_vector_int_func_x86_64_##floatType##_##elementCount, typed_seti_x86_64)
#define typed_func_x86_64_addi(floatType, elementCount) concat(typed_vector_int_func_x86_64_##floatType##_##elementCount, typed_addi_x86_64)
#define typed_func_x86_64_pow1(floatType, elementCount) typed_pow1_x86_64(floatType, elementCount)
#define typed_func_x86_64_i32gather(floatType, elementCount) typed_i32gather_x86_64(floatType, elementCount)
#define typed_func_x86_64_permute_lanes(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_permute_lanes_x86_64_##floatType##_##elementCount, typed_vector_func_type_x86_64_##floatType)
// (Identical to the definitions above.)
#define typed_permute_lanes_x86_64_float_5 _permutexvar_
#define typed_permute_lanes_x86_64_double_2 _permutex_
#define typed_permute_lanes_x86_64_float_2 _permute_
#define typed_permute_lanes_x86_64_double_1 _permute4x64_

420
conv/conv_gen.h Normal file
View file

@ -0,0 +1,420 @@
// Two-level helper macros: the public name expands its arguments first, then
// hands the result to the underscore-prefixed worker that applies # or ##.

// Identity; yields the (macro-expanded) argument unchanged.
#define _lit(x) x
#define lit(x) _lit(x)

// Stringification of the expanded argument.
#define _str(x) #x
#define str(x) _str(x)

// Token pasting of two expanded arguments.
#define _concat(lhs, rhs) lhs##rhs
#define concat(lhs, rhs) _concat(lhs, rhs)

// Token pasting of three expanded arguments.
#define _concat3(head, mid, tail) head##mid##tail
#define concat3(head, mid, tail) _concat3(head, mid, tail)
// Only the 12-bit lookup tables (xyz12_to_linear_*) exist, so refuse any other table size.
#if XYZ_LOOKUP_TABLE_SIZE != 12
#error "Not supported"
#endif
// Table lookup through the supplied gather operation: (table, indices, scale).
// The scale is 4 for float and 8 for double — presumably the element size in bytes; confirm against the gather wrapper.
#define typed_gather_float(gatherOp, indices) gatherOp(xyz12_to_linear_float, indices, 4)
#define typed_gather_double(gatherOp, indices) gatherOp(xyz12_to_linear_double, indices, 8)
// Source-lane indices for the three components of pixel l (lanes 3*l .. 3*l+2),
// listed in the same order as the arguments of seti in _shuf_idx.
// The argument is parenthesized so that an expression such as `n + 1` expands correctly.
// step1 reads: 3*l+1 -> lane 3*l, 3*l+2 -> lane 3*l+1, 3*l -> lane 3*l+2 (given highest-lane-first seti arguments).
#define _shuf_lane_step1(l) 3*(l), 3*(l)+2, 3*(l)+1
// step2 is the opposite rotation within each pixel.
#define _shuf_lane_step2(l) 3*(l)+1, 3*(l), 3*(l)+2
// Builds the 16-entry index vector for five pixels; the leading 0xf maps the unused lane 15 onto itself.
// `step` must be the literal 1 or 2 because it is pasted onto the helper name.
#define _shuf_idx(seti, step) seti(0xf, _shuf_lane_step##step(4), _shuf_lane_step##step(3), _shuf_lane_step##step(2), _shuf_lane_step##step(1), _shuf_lane_step##step(0))
// 3x3 matrix * vector for five packed pixels in one 16-lane vector, FMA variant.
// Computes v*m0 + permute(step1, v)*m1 + permute(step2, v)*m2, innermost term first.
// m0..m2 are the coefficient vectors produced by typed_mat_load_func_5.
// Lane layout implied by _shuf_idx: lanes 3*l..3*l+2 hold the components of pixel l, lane 15 is unused.
// NOTE(review): this spot previously carried the note "xxxxxyyyyyzzzzz0" (a planar layout), which does not
// match the interleaved indices generated by _shuf_idx — confirm which one is intended.
// fVectorType and add are unused here; they are accepted so all variants share one parameter list.
// v is expanded three times, so pass a plain variable.
#define typed_mul_vec_dot_5_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
fmadd( \
permuteLanesVar(_shuf_idx(seti, 2), v), \
m2, \
fmadd( \
permuteLanesVar(_shuf_idx(seti, 1), v), \
m1, \
mul(v, m0) \
) \
)
// 3x3 matrix * vector using an immediate-controlled permute, FMA variant.
// Computes v*m0 + permute(v, 0b11001001)*m1 + permute(v, 0b11010010)*m2, innermost term first.
// Assuming the usual two-bits-per-lane selector encoding, 0b11001001 picks source lanes (1,2,0,3)
// and 0b11010010 picks (2,0,1,3) for destination lanes (0,1,2,3) — confirm against the permute in use.
// fVectorType, seti and add are unused; v is expanded three times, so pass a plain variable.
// The 0b literals are a compiler extension prior to C23.
#define typed_mul_vec_dot_2_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
fmadd( \
permuteLanes(v, 0b11010010), \
m2, \
fmadd( \
permuteLanes(v, 0b11001001), \
m1, \
mul(v, m0) \
) \
)
// 3x3 matrix * vector using an immediate-controlled permute, variant without fused multiply-add.
// Computes (v*m0 + permute(v, 0b11001001)*m1) + permute(v, 0b11010010)*m2 with separate mul and add,
// pairing the same permutes with the same coefficient vectors as the FMA variant.
// fVectorType, seti and fmadd are unused; v is expanded three times, so pass a plain variable.
#define typed_mul_vec_dot_2_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
add( \
add( \
mul(v, m0), \
mul(permuteLanes(v, 0b11001001), m1) \
), \
mul(permuteLanes(v, 0b11010010), m2) \
)
// The one-vector kernels (elementCount == 1) reuse the elementCount == 2 bodies unchanged.
#define typed_mul_vec_dot_1_fma_0(vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x) \
	typed_mul_vec_dot_2_fma_0(vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x)
#define typed_mul_vec_dot_1_fma_1(vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x) \
	typed_mul_vec_dot_2_fma_1(vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x)

// Entry points per element count. useFma is pasted onto the kernel name and is not
// macro-expanded at this level, so by the time it arrives here it must already be the token 0 or 1.
#define typed_mul_vec_dot_1(useFma, vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x) \
	typed_mul_vec_dot_1_fma_##useFma(vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x)
#define typed_mul_vec_dot_2(useFma, vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x) \
	typed_mul_vec_dot_2_fma_##useFma(vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x)
#define typed_mul_vec_dot_5(useFma, vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x) \
	typed_mul_vec_dot_5_fma_##useFma(vecType, setInt, fmaOp, mulOp, addOp, permuteOp, c0, c1, c2, x)
// Integer offset vectors added to the converted values, one per element count.
// Each pixel group contributes (int16MaxPlusOne, int16MaxPlusOne, 0); the remaining entries are 0.
// Presumably the two offset lanes are the chroma components and the zero lane is luma, with
// seti taking the highest lane first — confirm against the store routine.
#define typed_yuv_add_5(seti) \
	seti( \
		0, \
		int16MaxPlusOne, int16MaxPlusOne, 0, \
		int16MaxPlusOne, int16MaxPlusOne, 0, \
		int16MaxPlusOne, int16MaxPlusOne, 0, \
		int16MaxPlusOne, int16MaxPlusOne, 0, \
		int16MaxPlusOne, int16MaxPlusOne, 0 \
	)
#define typed_yuv_add_2(seti) \
	seti( \
		0, int16MaxPlusOne, int16MaxPlusOne, 0, \
		0, int16MaxPlusOne, int16MaxPlusOne, 0 \
	)
#define typed_yuv_add_1(seti) \
	seti( \
		0, int16MaxPlusOne, int16MaxPlusOne, 0 \
	)
// Matrix loaders: spread the nine coefficients in[0..8] of a 3x3 matrix over the three
// coefficient vectors m[0..2] consumed by the typed_mul_vec_dot_* kernels.
//   m[0] carries in[0], in[4], in[8] (multiplied with the unpermuted input),
//   m[1] carries in[1], in[5], in[6] (multiplied with the first permutation),
//   m[2] carries in[2], in[3], in[7] (multiplied with the second permutation),
// so that each output component sums one full row in[3*r .. 3*r+2] of the matrix.
// The trailing comments list the arguments in call order; this assumes setf takes the
// highest lane first, so the rightmost argument lands in lane 0 — confirm for the setf in use.
// Five-pixel form: 15 coefficient lanes plus one zero padding lane.
#define typed_mat_load_func_5(setf, m, in) \
m[0] = setf(0, in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0]); /*0 bgr bgr bgr bgr bgr*/ \
m[1] = setf(0, in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1]); /*0 rbg rbg rbg rbg rbg*/ \
m[2] = setf(0, in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2]); /*0 grb grb grb grb grb*/
// Two-group form: each group of four lanes holds three coefficients and one zero.
#define typed_mat_load_func_2(setf, m, in) \
m[0] = setf(0, in[8], in[4], in[0], 0, in[8], in[4], in[0]); /* 0bgr 0bgr*/ \
m[1] = setf(0, in[6], in[5], in[1], 0, in[6], in[5], in[1]); /* 0rbg 0rbg*/ \
m[2] = setf(0, in[7], in[3], in[2], 0, in[7], in[3], in[2]); /* 0grb 0grb*/
// Single-group form: three coefficients and one zero.
#define typed_mat_load_func_1(setf, m, in) \
m[0] = setf(0, in[8], in[4], in[0]); /* 0bgr*/ \
m[1] = setf(0, in[6], in[5], in[1]); /* 0rbg*/ \
m[2] = setf(0, in[7], in[3], in[2]); /* 0grb*/
// todo: optimize pow via https://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent ?
// Name stems for the generated functions; the typed_converter_pump* macros append the
// float type via concat(), e.g. concat(_line_func, floatType).
#define _load_matrix_func _load_matrix_
#define _line_func convert_line_dci_xyz12_to_yuv16_
// typed_converter_pump1 instantiates, for one floatType, the converter state, the matrix
// loader and the line converter. The line converter handles one vector per loop iteration.
// Pipeline per vector: load packed code values -> table gather -> XYZ-to-RGB matrix ->
// clamp to [0, 1] -> pow with rgbGamma_g -> RGB-to-YUV matrix -> to int32 -> add offset -> store.
// NOTE(review): the loop steps by elementCount with no tail handling, so it presumably relies
// on width (or the buffers) being padded to a multiple of the step — confirm with the callers.
#define typed_converter_pump1(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
	/* Per-type state: two matrices as three coefficient vectors each, plus the code-value table. */ \
	fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
	fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
	floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
	\
	/* Fills m[0..2] from the nine coefficients in[0..8]. */ \
	void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
		typed_mat_load_func_##elementCount(setf, m, in) \
	} \
	\
	/* Converts one line of packed input into the luma/cb/cr planes; height is not referenced here. */ \
	inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
		/* Clamp limits applied to the RGB values before the gamma step. */ \
		const fVectorType upperBound = set1f(1.); \
		const fVectorType lowerBound = set1f(0.); \
		/* Integer offset added after the float-to-int conversion. */ \
		const iPackedType packedOffset = typed_yuv_add_##elementCount(seti); \
		\
		/* Local copies of the coefficient vectors. */ \
		const fVectorType xyzToRgb0 = xyz2rgb_mat_##floatType[0]; \
		const fVectorType xyzToRgb1 = xyz2rgb_mat_##floatType[1]; \
		const fVectorType xyzToRgb2 = xyz2rgb_mat_##floatType[2]; \
		\
		const fVectorType rgbToYuv0 = rgb2yuv_mat_##floatType[0]; \
		const fVectorType rgbToYuv1 = rgb2yuv_mat_##floatType[1]; \
		const fVectorType rgbToYuv2 = rgb2yuv_mat_##floatType[2]; \
		\
		for (int px = 0; px < width; px += elementCount) { \
			iPackedType codes = packedLoadFunc(px, in); \
			\
			fVectorType xyz = typed_gather_##floatType(i32gather, codes); \
			\
			fVectorType rgb = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, xyzToRgb0, xyzToRgb1, xyzToRgb2, xyz); \
			\
			/* min against 1 first, then max against 0. */ \
			rgb = maxf(minf(rgb, upperBound), lowerBound); \
			\
			rgb = fpow(rgb, rgbGamma_g); \
			\
			fVectorType yuv = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, rgbToYuv0, rgbToYuv1, rgbToYuv2, rgb); \
			\
			codes = addi(ftoi(yuv), packedOffset); \
			\
			packedStoreFunc(codes, px, luma, cb, cr); \
		} \
	}
// typed_converter_pump2 instantiates the same state and functions as the single-vector
// pump, but the line converter handles two vectors per loop iteration, stage by stage,
// so the two independent chains can overlap.
// Pipeline per vector: load packed code values -> table gather -> XYZ-to-RGB matrix ->
// clamp to [0, 1] -> pow with rgbGamma_g -> RGB-to-YUV matrix -> to int32 -> add offset -> store.
// NOTE(review): the loop steps by elementCount*2 with no tail handling, so it presumably relies
// on width (or the buffers) being padded to a multiple of the step — confirm with the callers.
#define typed_converter_pump2(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
	/* Per-type state: two matrices as three coefficient vectors each, plus the code-value table. */ \
	fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
	fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
	floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
	\
	/* Fills m[0..2] from the nine coefficients in[0..8]. */ \
	void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
		typed_mat_load_func_##elementCount(setf, m, in) \
	} \
	\
	/* Converts one line of packed input into the luma/cb/cr planes; height is not referenced here. */ \
	inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
		/* Clamp limits applied to the RGB values before the gamma step. */ \
		const fVectorType upperBound = set1f(1.); \
		const fVectorType lowerBound = set1f(0.); \
		/* Integer offset added after the float-to-int conversion. */ \
		const iPackedType packedOffset = typed_yuv_add_##elementCount(seti); \
		\
		/* Local copies of the coefficient vectors. */ \
		const fVectorType xyzToRgb0 = xyz2rgb_mat_##floatType[0]; \
		const fVectorType xyzToRgb1 = xyz2rgb_mat_##floatType[1]; \
		const fVectorType xyzToRgb2 = xyz2rgb_mat_##floatType[2]; \
		\
		const fVectorType rgbToYuv0 = rgb2yuv_mat_##floatType[0]; \
		const fVectorType rgbToYuv1 = rgb2yuv_mat_##floatType[1]; \
		const fVectorType rgbToYuv2 = rgb2yuv_mat_##floatType[2]; \
		\
		for (int px = 0; px < width; px += elementCount*2) { \
			iPackedType codes0 = packedLoadFunc(px, in); \
			iPackedType codes1 = packedLoadFunc(px+elementCount, in); \
			\
			fVectorType lin0 = typed_gather_##floatType(i32gather, codes0); \
			fVectorType lin1 = typed_gather_##floatType(i32gather, codes1); \
			\
			fVectorType rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, xyzToRgb0, xyzToRgb1, xyzToRgb2, lin0); \
			fVectorType rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, xyzToRgb0, xyzToRgb1, xyzToRgb2, lin1); \
			\
			/* min against 1 first, then max against 0. */ \
			rgb0 = maxf(minf(rgb0, upperBound), lowerBound); \
			rgb1 = maxf(minf(rgb1, upperBound), lowerBound); \
			\
			rgb0 = fpow(rgb0, rgbGamma_g); \
			rgb1 = fpow(rgb1, rgbGamma_g); \
			\
			fVectorType yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, rgbToYuv0, rgbToYuv1, rgbToYuv2, rgb0); \
			fVectorType yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, rgbToYuv0, rgbToYuv1, rgbToYuv2, rgb1); \
			\
			codes0 = addi(ftoi(yuv0), packedOffset); \
			codes1 = addi(ftoi(yuv1), packedOffset); \
			\
			packedStoreFunc(codes0, px, luma, cb, cr); \
			packedStoreFunc(codes1, px+elementCount, luma, cb, cr); \
		} \
	}
// typed_converter_pump4 instantiates the same state and functions as the single-vector
// pump, but the line converter handles four vectors per loop iteration, stage by stage,
// so the four independent chains can overlap.
// Pipeline per vector: load packed code values -> table gather -> XYZ-to-RGB matrix ->
// clamp to [0, 1] -> pow with rgbGamma_g -> RGB-to-YUV matrix -> to int32 -> add offset -> store.
// NOTE(review): the loop steps by elementCount*4 with no tail handling, so it presumably relies
// on width (or the buffers) being padded to a multiple of the step — confirm with the callers.
#define typed_converter_pump4(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
	/* Per-type state: two matrices as three coefficient vectors each, plus the code-value table. */ \
	fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
	fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
	floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
	\
	/* Fills m[0..2] from the nine coefficients in[0..8]. */ \
	void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
		typed_mat_load_func_##elementCount(setf, m, in) \
	} \
	\
	/* Converts one line of packed input into the luma/cb/cr planes; height is not referenced here. */ \
	inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
		/* Clamp limits applied to the RGB values before the gamma step. */ \
		const fVectorType upperBound = set1f(1.); \
		const fVectorType lowerBound = set1f(0.); \
		/* Integer offset added after the float-to-int conversion. */ \
		const iPackedType packedOffset = typed_yuv_add_##elementCount(seti); \
		\
		/* Local copies of the coefficient vectors. */ \
		const fVectorType xyzToRgb0 = xyz2rgb_mat_##floatType[0]; \
		const fVectorType xyzToRgb1 = xyz2rgb_mat_##floatType[1]; \
		const fVectorType xyzToRgb2 = xyz2rgb_mat_##floatType[2]; \
		\
		const fVectorType rgbToYuv0 = rgb2yuv_mat_##floatType[0]; \
		const fVectorType rgbToYuv1 = rgb2yuv_mat_##floatType[1]; \
		const fVectorType rgbToYuv2 = rgb2yuv_mat_##floatType[2]; \
		\
		for (int px = 0; px < width; px += elementCount*4) { \
			iPackedType codes0 = packedLoadFunc(px, in); \
			iPackedType codes1 = packedLoadFunc(px+elementCount, in); \
			iPackedType codes2 = packedLoadFunc(px+elementCount*2, in); \
			iPackedType codes3 = packedLoadFunc(px+elementCount*3, in); \
			\
			fVectorType lin0 = typed_gather_##floatType(i32gather, codes0); \
			fVectorType lin1 = typed_gather_##floatType(i32gather, codes1); \
			fVectorType lin2 = typed_gather_##floatType(i32gather, codes2); \
			fVectorType lin3 = typed_gather_##floatType(i32gather, codes3); \
			\
			fVectorType rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, xyzToRgb0, xyzToRgb1, xyzToRgb2, lin0); \
			fVectorType rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, xyzToRgb0, xyzToRgb1, xyzToRgb2, lin1); \
			fVectorType rgb2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, xyzToRgb0, xyzToRgb1, xyzToRgb2, lin2); \
			fVectorType rgb3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, xyzToRgb0, xyzToRgb1, xyzToRgb2, lin3); \
			\
			/* min against 1 first, then max against 0. */ \
			rgb0 = maxf(minf(rgb0, upperBound), lowerBound); \
			rgb1 = maxf(minf(rgb1, upperBound), lowerBound); \
			rgb2 = maxf(minf(rgb2, upperBound), lowerBound); \
			rgb3 = maxf(minf(rgb3, upperBound), lowerBound); \
			\
			rgb0 = fpow(rgb0, rgbGamma_g); \
			rgb1 = fpow(rgb1, rgbGamma_g); \
			rgb2 = fpow(rgb2, rgbGamma_g); \
			rgb3 = fpow(rgb3, rgbGamma_g); \
			\
			fVectorType yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, rgbToYuv0, rgbToYuv1, rgbToYuv2, rgb0); \
			fVectorType yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, rgbToYuv0, rgbToYuv1, rgbToYuv2, rgb1); \
			fVectorType yuv2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, rgbToYuv0, rgbToYuv1, rgbToYuv2, rgb2); \
			fVectorType yuv3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, rgbToYuv0, rgbToYuv1, rgbToYuv2, rgb3); \
			\
			codes0 = addi(ftoi(yuv0), packedOffset); \
			codes1 = addi(ftoi(yuv1), packedOffset); \
			codes2 = addi(ftoi(yuv2), packedOffset); \
			codes3 = addi(ftoi(yuv3), packedOffset); \
			\
			packedStoreFunc(codes0, px, luma, cb, cr); \
			packedStoreFunc(codes1, px+elementCount, luma, cb, cr); \
			packedStoreFunc(codes2, px+elementCount*2, luma, cb, cr); \
			packedStoreFunc(codes3, px+elementCount*3, luma, cb, cr); \
		} \
	}
#define typed_converter_pump8(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
\
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
typed_mat_load_func_##elementCount(setf, m, in) \
}; \
\
\
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
iPackedType packed0, packed1, packed2, packed3, packed4, packed5, packed6, packed7; \
fVectorType xyz0, xyz1, xyz2, xyz3, xyz4, xyz5, xyz6, xyz7; \
fVectorType rgb0, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7; \
fVectorType yuv0, yuv1, yuv2, yuv3, yuv4, yuv5, yuv6, yuv7; \
\
const fVectorType minValue = set1f(1.); \
const fVectorType maxValue = set1f(0.); \
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
\
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
\
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
\
for (int j = 0; j < width; j += elementCount*8) { \
packed0 = packedLoadFunc(j, in); \
packed1 = packedLoadFunc(j+elementCount, in); \
packed2 = packedLoadFunc(j+elementCount*2, in); \
packed3 = packedLoadFunc(j+elementCount*3, in); \
packed4 = packedLoadFunc(j+elementCount*4, in); \
packed5 = packedLoadFunc(j+elementCount*5, in); \
packed6 = packedLoadFunc(j+elementCount*6, in); \
packed7 = packedLoadFunc(j+elementCount*7, in); \
\
xyz0 = typed_gather_##floatType(i32gather, packed0); \