Initial commit

This commit is contained in:
DataHoarder 2024-02-19 20:00:08 +01:00
commit 6eda53859e
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
35 changed files with 2442 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/bin/*

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "conv/simd_utils"]
path = conv/simd_utils
url = https://github.com/JishinMaster/simd_utils.git

19
LICENSE Normal file
View file

@ -0,0 +1,19 @@
Copyright (c) 2024 WeebDataHoarder, xyz2yuv Contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

11
README.md Normal file
View file

@ -0,0 +1,11 @@
# xyz2yuv tool
Decode DCI XYZ' and apply conversions to your desired output colorspace. Supports Rec. 709 and Rec. 2020, with adjustable precision and gamma values.
Supports AVX-512 and AVX2 targets, and a generic implementation in C and Go as well.
## Dependencies
* CGO
* libopenjp2-dev
* libavformat-dev
* libavcodec-dev
* libavutil-dev

0
bin/.gitkeep Normal file
View file

14
build.sh Executable file
View file

@ -0,0 +1,14 @@
#!/bin/bash
# Builds the xyz2yuv binary plus its 2/4/8-pump SIMD variants into bin/.

# Work from the directory that contains this script. Abort if it cannot be
# entered so the builds never run against an unrelated working directory.
cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

# Prefer the toolchain below GOROOT when one is configured.
CMD=go
if [[ "${GOROOT}" != "" ]]; then
    CMD="${GOROOT}/bin/go"
fi

# build OUTPUT [CFLAGS]
# Compiles the module into bin/OUTPUT. When CFLAGS is given it is exported as
# CGO_CFLAGS for this one invocation only (the subshell keeps it from leaking);
# otherwise any CGO_CFLAGS already present in the environment is left untouched.
build() {
    (
        if [[ $# -gt 1 ]]; then
            export CGO_CFLAGS="$2"
        fi
        CGO_ENABLED=1 GOOS=linux GOARCH=amd64 "${CMD}" build -v -buildvcs=false -trimpath -gcflags=all="-l" -ldflags="-s -w -buildid=" -o "bin/$1" ./
    )
}

build xyz2yuv
for pumps in 2 4 8; do
    build "xyz2yuv_${pumps}pump" "-DSIMD_PUMPS=${pumps}"
done

53
colorspace/adaptation.go Normal file
View file

@ -0,0 +1,53 @@
package colorspace
import "gonum.org/v1/gonum/mat"
// ChromaticAdaptation is a matrix (underlying type mat.Dense) that is applied
// to XYZ white points by AdaptXYZ.
type ChromaticAdaptation mat.Dense

// AdaptXYZ multiplies the XYZ white points of from and to by the adaptation
// matrix and returns a 3x3 diagonal matrix whose entries are the per-row
// ratios to/from of those two products.
func (a ChromaticAdaptation) AdaptXYZ(from, to Illuminant) mat.Matrix {
	transform := (*mat.Dense)(&a)

	// respond projects the white point of an illuminant through the matrix.
	respond := func(white Illuminant) *mat.VecDense {
		x, y, z := white.ToXYZ()
		var out mat.VecDense
		out.MulVec(transform, mat.NewVecDense(3, []float64{x, y, z}))
		return &out
	}

	source, target := respond(from), respond(to)

	gains := make([]float64, 3)
	for i := range gains {
		gains[i] = target.AtVec(i) / source.AtVec(i)
	}
	return mat.NewDiagDense(3, gains)
}
// Predefined 3x3 adaptation matrices, given in row-major order.
// Note that these are *mat.Dense values as returned by mat.NewDense, not
// ChromaticAdaptation values.
var (
// ChromaticAdaptationBradford holds the Bradford coefficients.
ChromaticAdaptationBradford = mat.NewDense(3, 3, []float64{
0.8951, 0.2664, -0.1614,
-0.7502, 1.7135, 0.0367,
0.0389, -0.0685, 1.0296,
})
// ChromaticAdaptationCMCCAT2000 holds the CMCCAT2000 coefficients.
ChromaticAdaptationCMCCAT2000 = mat.NewDense(3, 3, []float64{
0.7982, 0.3389, -0.1371,
-0.5918, 1.5512, 0.0406,
0.0008, 0.0239, 0.9753,
})
// ChromaticAdaptationCIECAT02 holds the CIECAT02 coefficients.
ChromaticAdaptationCIECAT02 = mat.NewDense(3, 3, []float64{
0.7328, 0.4296, -0.1624,
-0.7036, 1.6975, 0.0061,
0.0030, 0.0136, 0.9834,
})
// ChromaticAdaptationSharp holds the Sharp coefficients.
ChromaticAdaptationSharp = mat.NewDense(3, 3, []float64{
1.2694, -0.0988, -0.1706,
-0.8364, 1.8006, 0.0357,
0.0297, -0.0315, 1.0018,
})
)

145
colorspace/chromaticity.go Normal file
View file

@ -0,0 +1,145 @@
package colorspace
import "gonum.org/v1/gonum/mat"
// Chromaticity describes an RGB space by the coordinates of its three
// primaries and its white point.
type Chromaticity struct {
	Red   ColorCoordinate
	Green ColorCoordinate
	Blue  ColorCoordinate
	White Illuminant
}

// ConversionXYZ derives the matrices between this RGB space and XYZ.
//
// to converts RGB to XYZ and from converts XYZ back to RGB (the inverse of to).
// It panics when either the primaries matrix or the resulting RGB-to-XYZ
// matrix cannot be inverted.
func (c Chromaticity) ConversionXYZ() (to, from *mat.Dense) {
	xr, yr, zr := c.Red.ToXYZ()
	xg, yg, zg := c.Green.ToXYZ()
	xb, yb, zb := c.Blue.ToXYZ()
	// One primary per column.
	primaries := mat.NewDense(3, 3, []float64{
		xr, xg, xb,
		yr, yg, yb,
		zr, zg, zb,
	})

	xw, yw, zw := c.White.ToXYZ()
	white := mat.NewVecDense(3, []float64{xw, yw, zw})

	var inverse mat.Dense
	if err := inverse.Inverse(primaries); err != nil {
		panic(err)
	}

	// Per-primary scale factors: inverse(primaries) * white.
	var scale mat.VecDense
	scale.MulVec(&inverse, white)

	// Scale every column of the primaries matrix by its factor.
	cells := make([]float64, 0, 9)
	for row := 0; row < 3; row++ {
		for col := 0; col < 3; col++ {
			cells = append(cells, scale.AtVec(col)*primaries.At(row, col))
		}
	}
	scaled := mat.NewDense(3, 3, cells)

	var rgbToXYZ, xyzToRGB mat.Dense
	rgbToXYZ.CloneFrom(scaled)
	if err := xyzToRGB.Inverse(scaled); err != nil {
		panic(err)
	}
	return &rgbToXYZ, &xyzToRGB
}
/*
func (c Chromaticity) XYZToRGB(connectionSpaceWhite Illuminant, adaptation ChromaticAdaptation) {
var err error
var RGB *mat.Dense
var W1, W2 *mat.VecDense
{
Xr, Yr, Zr := c.Red.ToXYZ()
Xg, Yg, Zg := c.Green.ToXYZ()
Xb, Yb, Zb := c.Blue.ToXYZ()
RGB = mat.NewDense(3, 3, []float64{
Xr, Xg, Xb,
Yr, Yg, Yb,
Zr, Zg, Zb,
})
}
{
Xw, Yw, Zw := c.White.ToXYZ()
W1 = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
}
{
Xw, Yw, Zw := connectionSpaceWhite.ToXYZ()
W2 = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
}
var tmp1, tmp2, M, Mc, M2, rgb2xyz, xyz2rgb, source_destination_whites, destination_source_whites, adapted_rgb2xyz_2, adapted_xyz2rgb_2 mat.Dense
var S, crdS, crdD, RA mat.VecDense
if err = tmp1.Inverse(RGB); err != nil {
panic(err)
}
S.MulVec(&tmp1, W1)
//TODO
M.Mul(&S, RGB)
rgb2xyz.CloneFrom(&M)
if err = xyz2rgb.Inverse(&M); err != nil {
panic(err)
}
// chromatic adaptation
crdS.MulVec((*mat.Dense)(&adaptation), W1)
crdD.MulVec((*mat.Dense)(&adaptation), W2)
Mt := mat.NewDiagDense(3, []float64{
crdD.AtVec(0) / crdS.AtVec(0),
crdD.AtVec(1) / crdS.AtVec(1),
crdD.AtVec(2) / crdS.AtVec(2),
})
if err = tmp1.Inverse((*mat.Dense)(&adaptation)); err != nil {
panic(err)
}
tmp2.Mul(&tmp1, Mt)
Mc.Mul(&tmp2, (*mat.Dense)(&adaptation))
source_destination_whites.CloneFrom(&Mc)
if err = destination_source_whites.Inverse(&Mc); err != nil {
panic(err)
}
M2.Mul(&Mc, &M)
adapted_rgb2xyz_2.CloneFrom(&M2)
if err = adapted_xyz2rgb_2.Inverse(&M2); err != nil {
panic(err)
}
RA.MulVec(&Mc, W1)
}
*/

18
colorspace/color.go Normal file
View file

@ -0,0 +1,18 @@
package colorspace
// ColorCoordinate is a pair of coordinates stored as {x, y}.
type ColorCoordinate [2]float64

// X returns the first coordinate.
func (c ColorCoordinate) X() float64 {
	x := c[0]
	return x
}

// Y returns the second coordinate.
func (c ColorCoordinate) Y() float64 {
	y := c[1]
	return y
}

// ToXYZ converts the coordinate to XYZ with Y fixed to 1:
// X = x/y, Y = 1, Z = (1-x-y)/y.
// A zero y component results in a division by zero (Inf or NaN results).
func (c ColorCoordinate) ToXYZ() (X, Y, Z float64) {
	x, y := c[0], c[1]
	return x / y, 1.0, (1 - x - y) / y
}

18
colorspace/illuminant.go Normal file
View file

@ -0,0 +1,18 @@
package colorspace
// Illuminant is an alias of ColorCoordinate used for white points, so every
// ColorCoordinate method (X, Y, ToXYZ) is available on it.
type Illuminant = ColorCoordinate
// Standard illuminants in 2 degree form, stored as {x, y} coordinate pairs.
var (
// IlluminantD50 is the D50 white point.
IlluminantD50 = Illuminant{0.34567, 0.35850}
// IlluminantD55 is the D55 white point.
IlluminantD55 = Illuminant{0.33242, 0.34743}
// IlluminantD60 is the white point of P3-D60 (ACES Cinema).
IlluminantD60 = Illuminant{0.32168, 0.33767}
// IlluminantD63 is the white point of P3-DCI (Theater).
IlluminantD63 = Illuminant{0.314, 0.351}
// IlluminantD65 is the standard D65 white point used by Rec. 709, Rec. 2020, sRGB and many more.
IlluminantD65 = Illuminant{0.31271, 0.32902}
)

79
colorspace/relative.go Normal file
View file

@ -0,0 +1,79 @@
package colorspace
// RelativeSystem bundles what describes an output color system: the
// chromaticity of its primaries and white point, the transfer function applied
// to linear values, and its YCbCr coefficients.
type RelativeSystem struct {
	Chromaticity       Chromaticity
	fromLinearTransfer TransferFunction
	YCbCr              YCbCrConverter
}

// FromLinear passes the linear value c through the system's transfer function.
func (s RelativeSystem) FromLinear(c float64) float64 {
	encode := s.fromLinearTransfer
	return encode(c)
}

// NewRelativeSystem assembles a RelativeSystem from its three parts.
func NewRelativeSystem(chromaticity Chromaticity, fromLinearTransfer TransferFunction, converter YCbCrConverter) RelativeSystem {
	var system RelativeSystem
	system.Chromaticity = chromaticity
	system.fromLinearTransfer = fromLinearTransfer
	system.YCbCr = converter
	return system
}
// Predefined output systems. All of them use the D65 white point; the
// Rec. 709 family shares one set of primaries (also used for sRGB) and the
// Rec. 2020 family another. The variants differ only in the transfer function
// applied to linear values.
var (
	// SystemSRGB uses the Rec. 709 primaries with the sRGB companding curve.
	SystemSRGB = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, CompandingSRGB, YCbCr_Rec709)
	// SystemRec709 uses the piecewise Rec. 709 companding curve.
	SystemRec709 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, CompandingRec709, YCbCr_Rec709)
	// SystemRec709_Pure uses a pure power curve with GammaRec709.
	SystemRec709_Pure = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, PureRec709, YCbCr_Rec709)
	// SystemRec709_Pure22 uses a pure power curve with gamma 2.2.
	SystemRec709_Pure22 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, PureRec709_22, YCbCr_Rec709)
	// SystemRec709_Pure24 uses a pure power curve with gamma 2.4.
	SystemRec709_Pure24 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.640, 0.330},
		Green: ColorCoordinate{0.300, 0.600},
		Blue:  ColorCoordinate{0.150, 0.060},
		White: IlluminantD65,
	}, PureRec709_24, YCbCr_Rec709)
	// SystemRec2020 uses the piecewise Rec. 2020 companding curve.
	SystemRec2020 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.708, 0.292},
		Green: ColorCoordinate{0.170, 0.797},
		Blue:  ColorCoordinate{0.131, 0.046},
		White: IlluminantD65,
	}, CompandingRec2020, YCbCr_Rec2020)
	// SystemRec2020_Pure uses a pure power curve with GammaRec2020.
	SystemRec2020_Pure = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.708, 0.292},
		Green: ColorCoordinate{0.170, 0.797},
		Blue:  ColorCoordinate{0.131, 0.046},
		White: IlluminantD65,
	}, PureRec2020, YCbCr_Rec2020)
	// SystemRec2020_Pure24 uses a pure power curve with gamma 2.4.
	// It previously reused PureRec2020, which made it identical to
	// SystemRec2020_Pure despite its name.
	SystemRec2020_Pure24 = NewRelativeSystem(Chromaticity{
		Red:   ColorCoordinate{0.708, 0.292},
		Green: ColorCoordinate{0.170, 0.797},
		Blue:  ColorCoordinate{0.131, 0.046},
		White: IlluminantD65,
	}, PureRec2020_24, YCbCr_Rec2020)
)

66
colorspace/types.go Normal file
View file

@ -0,0 +1,66 @@
package colorspace
import "math"
// TransferFunction maps one component value e to its transformed value.
type TransferFunction func(e float64) float64

// DCINormalizationFactor is the ratio 48/52.37; TransferFromDCIXYZ divides by
// it to undo that scaling of decoded DCI X'Y'Z' values.
const DCINormalizationFactor = 48 / 52.37

// Gamma values used by the transfer functions and lookup tables.
const (
	Gamma22      = 2.2
	Gamma24      = 2.4
	GammaDCIXYZ  = 2.6
	GammaSRGB    = Gamma22
	GammaRec709  = 1 / 0.45
	GammaRec2020 = GammaRec709
)

// Constants of the piecewise Rec. 709 / Rec. 2020 curve.
// https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2020-2-201510-I!!PDF-E.pdf
const alpha = 1.09929682680944 // 10 * Math.pow(beta, 0.55)
const beta = 0.018053968510807

var (
	// TransferFromDCIXYZ removes the DCI normalization from a linearized value.
	TransferFromDCIXYZ TransferFunction = func(e float64) float64 {
		return e / DCINormalizationFactor
	}
	// CompandingSRGB is the piecewise sRGB encoding: a linear segment up to
	// 0.0031308 and an offset power segment above it.
	CompandingSRGB TransferFunction = func(e float64) float64 {
		if e <= 0.0031308 {
			return 12.92 * e
		}
		// The power segment of the piecewise curve uses the exponent 1/2.4.
		// GammaSRGB (2.2) only approximates the overall curve; using it here
		// made the two segments disagree at the threshold.
		return 1.055*math.Pow(e, 1/Gamma24) - 0.055
	}
	// CompandingRec709 ITU-R BT.709
	CompandingRec709 TransferFunction = func(e float64) float64 {
		if e < beta {
			return 4.5 * e
		}
		return alpha*math.Pow(e, 1/GammaRec709) - (alpha - 1)
	}
	// PureRec709 is a pure power curve with exponent 1/GammaRec709.
	PureRec709 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/GammaRec709)
	}
	// PureRec709_22 is a pure power curve with exponent 1/2.2.
	PureRec709_22 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/Gamma22)
	}
	// PureRec709_24 is a pure power curve with exponent 1/2.4.
	PureRec709_24 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/Gamma24)
	}
	// The Rec. 2020 curves reuse the Rec. 709 implementations.
	CompandingRec2020 = CompandingRec709
	PureRec2020       = PureRec709
	PureRec2020_22    = PureRec709_22
	PureRec2020_24    = PureRec709_24
)
// LUT is a lookup table indexed by an integer code value.
type LUT []float64

// NewGammaLUT builds a table with 2^bits entries in which entry i holds
// (i / (2^bits - 1)) raised to the power gamma.
func NewGammaLUT(gamma float64, bits int) (lut LUT) {
	last := (1 << bits) - 1
	scale := float64(last)

	lut = make(LUT, last+1)
	for code := range lut {
		normalized := float64(code) / scale
		lut[code] = math.Pow(normalized, gamma)
	}
	return lut
}

28
colorspace/xyz.go Normal file
View file

@ -0,0 +1,28 @@
package colorspace
import "math"
// XYZSystem An absolute representation
type XYZSystem struct {
	// gamma is the exponent ToLinear applies to floating point input.
	gamma float64
	// ToLinearLUT holds the precomputed result of the gamma step per code value.
	ToLinearLUT LUT
	// ToLinearTransfer any adjustments on top of gamma
	ToLinearTransfer TransferFunction
}

// ToLinearFrom16 linearizes the code value c: it looks c up in ToLinearLUT and
// passes the result through ToLinearTransfer.
func (s XYZSystem) ToLinearFrom16(c uint16) float64 {
	decoded := s.ToLinearLUT[c]
	return s.ToLinearTransfer(decoded)
}

// ToLinear linearizes the value c: it raises c to the power gamma and passes
// the result through ToLinearTransfer.
func (s XYZSystem) ToLinear(c float64) float64 {
	decoded := math.Pow(c, s.gamma)
	return s.ToLinearTransfer(decoded)
}
// NewXYZSystem creates an XYZSystem that decodes with the given gamma and then
// applies toLinearTransfer. A 16-bit lookup table for the gamma step is
// precomputed for ToLinearFrom16.
func NewXYZSystem(gamma float64, toLinearTransfer TransferFunction) XYZSystem {
	return XYZSystem{
		// gamma must be stored as well: ToLinear reads it, and leaving it at
		// zero made ToLinear raise every input to the power 0.
		gamma:            gamma,
		ToLinearLUT:      NewGammaLUT(gamma, 16),
		ToLinearTransfer: toLinearTransfer,
	}
}

// DCIXYZSystem decodes DCI X'Y'Z' using GammaDCIXYZ and TransferFromDCIXYZ.
var DCIXYZSystem = NewXYZSystem(GammaDCIXYZ, TransferFromDCIXYZ)

35
colorspace/yuv.go Normal file
View file

@ -0,0 +1,35 @@
package colorspace
import "gonum.org/v1/gonum/mat"
// YCbCrConverter holds the luma coefficients of the red, green and blue
// channels from which the RGB <-> YPbPr matrices are built.
type YCbCrConverter struct {
	Kr, Kg, Kb float64
}

// ConversionRGB builds the matrices between RGB and YPbPr.
//
// to converts YPbPr to RGB and from converts RGB to YPbPr.
func (c YCbCrConverter) ConversionRGB() (to, from *mat.Dense) {
	const half = 1. / 2.
	kr, kg, kb := c.Kr, c.Kg, c.Kb

	// Rows: Y, Pb, Pr.
	forward := []float64{
		kr, kg, kb,
		-half * (kr / (1 - kb)), -half * (kg / (1 - kb)), half,
		half, -half * (kg / (1 - kr)), -half * (kb / (1 - kr)),
	}
	// Rows: R, G, B.
	backward := []float64{
		1, 0, 2 - 2*kr,
		1, -(kb / kg) * (2 - 2*kb), -(kr / kg) * (2 - 2*kr),
		1, 2 - 2*kb, 0,
	}

	rgbToYPbPr := mat.NewDense(3, 3, forward)
	yPbPrToRGB := mat.NewDense(3, 3, backward)
	return yPbPrToRGB, rgbToYPbPr
}
// NewYCbCrConverter creates a converter from the luma coefficients of the
// red, green and blue channels.
func NewYCbCrConverter(kr, kg, kb float64) YCbCrConverter {
	return YCbCrConverter{
		Kr: kr,
		Kg: kg,
		Kb: kb,
	}
}

// YCbCr_Rec709 uses the Rec. 709 luma coefficients.
var YCbCr_Rec709 = NewYCbCrConverter(0.2126, 0.7152, 0.0722)

// YCbCr_Rec2020 uses the Rec. 2020 luma coefficients. Kr is 0.2627; the
// previous value 0.2127 made the three coefficients sum to 0.95 instead of 1.
var YCbCr_Rec2020 = NewYCbCrConverter(0.2627, 0.6780, 0.0593)

103
conv/conv.c Normal file
View file

@ -0,0 +1,103 @@
#include "conv.h"
#include <math.h>
#include <stdlib.h>
// Largest value of an unsigned 16-bit integer (65535) as a double.
const double uint16Max = (double)((1<<16)-1);
// 2^15 (32768) as a double.
const double int16MaxPlusOne = (double)(1<<15);
// Conversion matrices handed to init(); only the pointers are stored, so the
// caller has to keep the memory valid.
const double* restrict xyz2rgb_g;
const double* restrict rgb2yuv_g;
// Reciprocal of the rgbGamma passed to init().
double rgbGamma_g;
// Select the conversion pipeline at compile time and instantiate the float and
// double line converters for it. DecoderInformation names the selected one.
#if USE_SIMD
#include "conv_gen.h"
#endif
#if USE_SIMD && __AVX2__
#include "conv_avx.h"
// This test has to match the one in conv_avx.h, which only defines the 512-bit
// load/store/pow helpers when __AVX512DQ__ is available as well. Without DQ
// the 256-bit pipeline below is used instead of referencing missing helpers.
#if USE_512_WIDE_PIPELINE && __AVX512F__ && __AVX512VL__ && __AVX512BW__ && __AVX512DQ__
//512-bit wide pipeline
typed_converter_platform(x86_64, SIMD_PUMPS, float, 5, 1, load_packed_512f, store_packed_512f);
typed_converter_platform(x86_64, SIMD_PUMPS, double, 2, 1, load_packed_512, store_packed_512);
const char* DecoderInformation = "SIMD AVX-512 512-bit pipeline (2d 5f " str(SIMD_PUMPS) "pump)";
#else
// __FMA__ is passed to the converter macro as a 0/1 flag, so give it a value
// when the compiler did not define it.
#if !defined(__FMA__)
#define __FMA__ 0
const char* DecoderInformation = "SIMD AVX2 256-bit pipeline (1d 2f " str(SIMD_PUMPS) "pump)";
#else
const char* DecoderInformation = "SIMD AVX2 + FMA 256-bit pipeline (1d 2f " str(SIMD_PUMPS) "pump)";
#endif
//256-bit wide pipeline
typed_converter_platform(x86_64, SIMD_PUMPS, float, 2, __FMA__, load_packed_256f, store_packed_256f);
typed_converter_platform(x86_64, SIMD_PUMPS, double, 1, __FMA__, load_packed_256, store_packed_256);
#endif
#else
//No ASM defined
// conv_generic.h is included once per floating point type, selected by fType.
#define fType float
#include "conv_generic.h"
#undef fType
#define fType double
#include "conv_generic.h"
// Force USE_SIMD off so init() skips the SIMD-only matrix loading.
#undef USE_SIMD
#define USE_SIMD 0
const char* DecoderInformation = "Generic scalar pipeline (1d 1f 1pump)";
#endif
// Returns the description of the pipeline that was selected at compile time.
// The string is a static literal and must not be freed.
// Declared with (void): an empty parameter list in C leaves the parameters
// unspecified rather than declaring that there are none.
const char* decoder_information(void) {
    return DecoderInformation;
}
// Stores the conversion parameters and fills the XYZ lookup tables.
//
// xyz2rgb / rgb2yuv: conversion matrices; the pointers are kept in globals,
//                    so the memory has to stay valid after this call.
// xyzGamma:          exponent used to linearize the input code values.
// rgbGamma:          output gamma; its reciprocal is stored.
void init(const double* restrict xyz2rgb, const double* restrict rgb2yuv, double xyzGamma, double rgbGamma) {
    const int tableEntries = 1 << XYZ_LOOKUP_TABLE_SIZE;

    xyz2rgb_g = xyz2rgb;
    rgb2yuv_g = rgb2yuv;
    rgbGamma_g = 1./rgbGamma;

    // Entry n holds (n / largest code value) ^ xyzGamma; the float table is
    // filled from the double table.
    for (int code = 0; code < tableEntries; code++) {
        xyz12_to_linear_double[code] = pow((double)(code)/(tableEntries-1), xyzGamma);
        xyz12_to_linear_float[code] = xyz12_to_linear_double[code];
    }

#if USE_SIMD
    // The SIMD pipelines keep their own copies of both matrices per precision.
    _load_matrix_double(xyz2rgb_mat_double, xyz2rgb);
    _load_matrix_float(xyz2rgb_mat_float, xyz2rgb);
    _load_matrix_double(rgb2yuv_mat_double, rgb2yuv);
    _load_matrix_float(rgb2yuv_mat_float, rgb2yuv);
#endif
}
// Converts a whole frame with the single precision pipeline by running the
// line converter once per row. Each row consumes width*3 input values and
// produces width values in each of the luma, cb and cr planes.
void convert_frame_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
    int row = 0;
    while (row < height) {
        convert_line_dci_xyz12_to_yuv16_float(in, luma, cb, cr, width, height);

        // Advance all four cursors to the start of the next row.
        in += width*3;
        luma += width;
        cb += width;
        cr += width;
        row++;
    }
}
// Converts a whole frame with the double precision pipeline by running the
// line converter once per row. Each row consumes width*3 input values and
// produces width values in each of the luma, cb and cr planes.
void convert_frame_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
    int row = 0;
    while (row < height) {
        convert_line_dci_xyz12_to_yuv16_double(in, luma, cb, cr, width, height);

        // Advance all four cursors to the start of the next row.
        in += width*3;
        luma += width;
        cb += width;
        cr += width;
        row++;
    }
}

127
conv/conv.go Normal file
View file

@ -0,0 +1,127 @@
package conv
import (
"gonum.org/v1/gonum/mat"
"runtime"
"unsafe"
)
/*
#cgo CFLAGS: -march=native -mtune=native -Ofast -std=c99
#cgo LDFLAGS: -lm
#include "conv.h"
*/
import "C"
// matPinner pins the matrices handed to the C side by InitData. It is never
// unpinned in this file because the C code keeps the pointers.
var matPinner runtime.Pinner

// DecoderInformation returns the description of the conversion pipeline the C
// code was compiled with.
func DecoderInformation() string {
	info := C.decoder_information()
	return C.GoString(info)
}
// InitData hands the conversion matrices and gamma values to the C side.
// The backing arrays of both matrices are pinned through matPinner before the
// call, because the C code stores the pointers it receives.
func InitData(xyz2rgb, rgb2yuv *mat.Dense, xyzGamma, rgbGamma float64) {
	xyzMatrix := unsafe.Pointer(unsafe.SliceData(xyz2rgb.RawMatrix().Data))
	yuvMatrix := unsafe.Pointer(unsafe.SliceData(rgb2yuv.RawMatrix().Data))

	for _, p := range []unsafe.Pointer{xyzMatrix, yuvMatrix} {
		matPinner.Pin(p)
	}

	C.init(
		(*C.double)(xyzMatrix),
		(*C.double)(yuvMatrix),
		C.double(xyzGamma),
		C.double(rgbGamma),
	)
}
func ConvertFrameDCIXYZToYUV16Double(in []uint32, luma, cb, cr []uint16, width, height int) {
var pinner runtime.Pinner
inp := unsafe.Pointer(unsafe.SliceData(in))
lumap := unsafe.Pointer(unsafe.SliceData(luma))
cbp := unsafe.Pointer(unsafe.SliceData(cb))
crp := unsafe.Pointer(unsafe.SliceData(cr))
pinner.Pin(inp)
pinner.Pin(lumap)
pinner.Pin(cbp)
pinner.Pin(crp)
defer pinner.Unpin()
C.convert_frame_dci_xyz12_to_yuv16_double(
(*C.uint)(inp),
(*C.ushort)(lumap),
(*C.ushort)(cbp),
(*C.ushort)(crp),
C.int(width),
C.int(height),
)
}
func ConvertLineDCIXYZToYUV16Double(in []uint32, luma, cb, cr []uint16, width, height int) {
var pinner runtime.Pinner
inp := unsafe.Pointer(unsafe.SliceData(in))
lumap := unsafe.Pointer(unsafe.SliceData(luma))
cbp := unsafe.Pointer(unsafe.SliceData(cb))
crp := unsafe.Pointer(unsafe.SliceData(cr))
pinner.Pin(inp)
pinner.Pin(lumap)
pinner.Pin(cbp)
pinner.Pin(crp)
defer pinner.Unpin()
C.convert_line_dci_xyz12_to_yuv16_double(
(*C.uint)(inp),
(*C.ushort)(lumap),
(*C.ushort)(cbp),
(*C.ushort)(crp),
C.int(width),
C.int(height),
)
}
func ConvertFrameDCIXYZToYUV16Float(in []uint32, luma, cb, cr []uint16, width, height int) {
var pinner runtime.Pinner
inp := unsafe.Pointer(unsafe.SliceData(in))
lumap := unsafe.Pointer(unsafe.SliceData(luma))
cbp := unsafe.Pointer(unsafe.SliceData(cb))
crp := unsafe.Pointer(unsafe.SliceData(cr))
pinner.Pin(inp)
pinner.Pin(lumap)
pinner.Pin(cbp)
pinner.Pin(crp)
defer pinner.Unpin()
C.convert_frame_dci_xyz12_to_yuv16_float(
(*C.uint)(inp),
(*C.ushort)(lumap),
(*C.ushort)(cbp),
(*C.ushort)(crp),
C.int(width),
C.int(height),
)
}
func ConvertLineDCIXYZToYUV16Float(in []uint32, luma, cb, cr []uint16, width, height int) {
var pinner runtime.Pinner
inp := unsafe.Pointer(unsafe.SliceData(in))
lumap := unsafe.Pointer(unsafe.SliceData(luma))
cbp := unsafe.Pointer(unsafe.SliceData(cb))
crp := unsafe.Pointer(unsafe.SliceData(cr))
pinner.Pin(inp)
pinner.Pin(lumap)
pinner.Pin(cbp)
pinner.Pin(crp)
defer pinner.Unpin()
C.convert_line_dci_xyz12_to_yuv16_float(
(*C.uint)(inp),
(*C.ushort)(lumap),
(*C.ushort)(cbp),
(*C.ushort)(crp),
C.int(width),
C.int(height),
)
}

50
conv/conv.h Normal file
View file

@ -0,0 +1,50 @@
// Public interface of the DCI XYZ -> YUV converter and its build-time options.
// Guarded so that including it more than once per translation unit is safe.
#ifndef XYZ2YUV_CONV_H
#define XYZ2YUV_CONV_H

#include <stdint.h>

// Stores the conversion matrices and gamma values and fills the XYZ lookup
// tables. The matrix pointers are retained, so the memory has to stay valid.
void init(const double* restrict xyz2rgb, const double* restrict rgb2yuv, double xyzGamma, double rgbGamma);

// Use available SIMD. Disable to enforce the generic pipeline.
#if !defined(USE_SIMD)
#define USE_SIMD 1
#endif

// Opportunistically use AVX-512 features even in 256-bit mode.
#if !defined(USE_OPPORTUNISTIC_AVX512)
#define USE_OPPORTUNISTIC_AVX512 1
#endif

// AVX2 (256-bit pipeline) lane layouts:
// double layout aaa0
// float layout aaabbb00

// Enable usage of the 512-bit wide pipeline, if supported.
// Uses AVX-512 features. Requires AVX-512 F, VL, BW and DQ.
// double layout aaabbb00
// float layout aaabbbcccdddeee0
#if !defined(USE_512_WIDE_PIPELINE)
#define USE_512_WIDE_PIPELINE 1
#endif

// Sets the number of pumps per iteration of the pipeline. Supported: 1, 2, 4, 8.
// Raise this if your CPU has more execution units than usual. Recommended to stay at 2 or 4.
#if !defined(SIMD_PUMPS)
#define SIMD_PUMPS 1
#endif

// ExpandLoad or CompressStore is slower than the currently used set(vals...). todo: inspect with CPUs newer than ZEN4
#if !defined(USE_AVX512_EXPANDLOAD)
#define USE_AVX512_EXPANDLOAD 0
#endif

// Size of the lookup table in bits. The only valid value is 12.
#if !defined(XYZ_LOOKUP_TABLE_SIZE)
#define XYZ_LOOKUP_TABLE_SIZE 12
#endif

// These functions can run slightly out of bounds, about 24 bytes per pump per line (XYZ) or 8 bytes per pump per line (YUV); the caller should allocate extra input/output buffer space for this reason.
void convert_line_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
void convert_frame_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
void convert_line_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
void convert_frame_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);

// Returns a static string describing the pipeline selected at compile time.
const char* decoder_information(void);

#endif // XYZ2YUV_CONV_H

271
conv/conv_avx.h Normal file
View file

@ -0,0 +1,271 @@
#include <immintrin.h>
#include <stdint.h>
// x86_64 AVX2 and AVX512 definitions
// Index of a component inside a packed vector, counted in 16-bit elements.
//   i: pixel number inside the vector
//   c: component number inside the pixel
//   v: number of 32-bit lanes occupied per pixel
// The factor 2 converts the 32-bit lane number into a 16-bit element number.
// Arguments are parenthesized so that expressions can be passed safely.
#define _perm_component_i(i, c, v) (((i)*(v)+(c))*2)
// Indices of component c for 2 pixels spaced 4 lanes apart, highest pixel first.
#define _perm_component2(c) _perm_component_i(1, c, 4), _perm_component_i(0, c, 4)
// Indices of component c for 5 pixels spaced 3 lanes apart, highest pixel first.
#define _perm_component5(c) _perm_component_i(4, c, 3), _perm_component_i(3, c, 3), _perm_component_i(2, c, 3), _perm_component_i(1, c, 3), _perm_component_i(0, c, 3)
#if USE_512_WIDE_PIPELINE && __AVX512F__ && __AVX512VL__ && __AVX512BW__ && __AVX512DQ__
// Loads the two pixels (six 32-bit values) starting at in[j*3] into a 256-bit
// vector laid out as aaa0bbb0: each pixel occupies four lanes and the fourth
// lane of each pixel is zero.
__m256i load_packed_512(int j, const uint32_t* restrict in) {
    const uint32_t* src = in+j*3;
#if USE_AVX512_EXPANDLOAD
    //todo: this path is slower than _mm256_set_epi32, both mask and maskz ZEN4
    return _mm256_maskz_expandloadu_epi32(_cvtu32_mask8(0b01110111), src);
#else
    // Load six values (lanes 6 and 7 are zeroed by the mask), then spread them
    // out; lane 7 supplies the zero for both padding lanes.
    const __m256i contiguous = _mm256_maskz_loadu_epi32(_cvtu32_mask8(0b00111111), src);
    const __m256i order = _mm256_set_epi32(7, 5, 4, 3, 7, 2, 1, 0);
    return _mm256_permutexvar_epi32(order, contiguous);
#endif
}
// Writes two converted pixels to the planes at index j and j+1.
// The permute collects the low 16 bits of every component so that elements
// 0-1 hold luma, 2-3 hold cb and 4-5 hold cr. The masked stores then write
// each pair; the cb and cr base pointers are moved back by 2 and 4 elements
// so that the selected elements land on index j and j+1.
void store_packed_512(__m256i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
#define _perm_idx512 _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component2(2), _perm_component2(1), _perm_component2(0))
    const __m256i gathered = _mm256_permutexvar_epi16(_perm_idx512, packed);
    const __m128i planar = _mm256_castsi256_si128(gathered);

    const __mmask8 lumaElements = _cvtu32_mask8(0b00000011);
    const __mmask8 cbElements = _cvtu32_mask8(0b00001100);
    const __mmask8 crElements = _cvtu32_mask8(0b00110000);

    _mm_mask_storeu_epi16(&luma[j], lumaElements, planar);
    _mm_mask_storeu_epi16(&cb[j-2], cbElements, planar);
    _mm_mask_storeu_epi16(&cr[j-4], crElements, planar);
}
// Loads the five pixels (fifteen 32-bit values) starting at in[j*3] into a
// 512-bit vector; the sixteenth lane is zeroed by the load mask.
__m512i load_packed_512f(int j, const uint32_t* restrict in) {
    const uint32_t* src = in+j*3;
    const __mmask16 fifteenLanes = _cvtu32_mask16(0b0111111111111111);
    return _mm512_maskz_loadu_epi32(fifteenLanes, src);
}
// Writes five converted pixels to the planes at index j .. j+4.
// The permute collects the low 16 bits of every component so that elements
// 0-4 hold luma, 5-9 hold cb and 10-14 hold cr. The masked stores then write
// each group; the cb and cr base pointers are moved back by 5 and 10 elements
// so that the selected elements land on index j .. j+4.
void store_packed_512f(__m512i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
#define _perm_idx512f _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component5(2), _perm_component5(1), _perm_component5(0))
    const __m512i gathered = _mm512_permutexvar_epi16(_perm_idx512f, packed);
    const __m256i planar = _mm512_castsi512_si256(gathered);

    const __mmask16 lumaElements = _cvtu32_mask16(0b0000000000011111);
    const __mmask16 cbElements = _cvtu32_mask16(0b0000001111100000);
    const __mmask16 crElements = _cvtu32_mask16(0b0111110000000000);

    _mm256_mask_storeu_epi16(&luma[j ], lumaElements, planar);
    _mm256_mask_storeu_epi16(&cb[j -5], cbElements, planar);
    _mm256_mask_storeu_epi16(&cr[j-10], crElements, planar);
}
#define SSE 1
#define AVX 1
#define AVX512 1
#include "simd_utils/simd_utils.h"
static inline __m512d pow512_pd1(__m512d x, double y1) {
const __m512d y = _mm512_set1_pd(y1);
return exp512_pd(_mm512_mul_pd(y, log512_pd(x)));
}
static inline __m512 pow512_ps1(__m512 x, double y1) {
const __m512d y = _mm512_set1_pd(y1);
//low precision cause issues, use doubles
__m256 a = _mm512_cvtpd_ps(exp512_pd(_mm512_mul_pd(y, log512_pd(_mm512_cvtps_pd(_mm512_castps512_ps256(x))))));
__m256 b = _mm512_cvtpd_ps(exp512_pd(_mm512_mul_pd(y, log512_pd(_mm512_cvtps_pd(_mm512_extractf32x8_ps(x, 1))))));
return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
}
#else
// Loads the pixel (three 32-bit values) starting at in[j*3] into a 128-bit
// vector. With AVX-512 the fourth lane is zeroed by the load mask; the
// fallback loads a full 16 bytes, so the fourth lane holds whatever follows
// the pixel in memory.
__m128i load_packed_256(int j, const uint32_t* restrict in) {
#if USE_OPPORTUNISTIC_AVX512 && __AVX512F__ && __AVX512VL__ && __AVX512DQ__
    return _mm_maskz_loadu_epi32(_cvtu32_mask8(0b00000111), in+j*3);
#else
    // _mm_loadu_epi32 is an AVX-512 intrinsic and therefore unavailable on
    // exactly the targets that reach this branch. _mm_loadu_si128 performs the
    // same unaligned 128-bit load and only needs SSE2.
    return _mm_loadu_si128((const __m128i*)(in+j*3));
    //return _mm_set_epi32(0, inz[j], iny[j], inx[j]);
#endif
}
// Writes one converted pixel to the planes at index j: lane 0 goes to luma,
// lane 1 to cb and lane 2 to cr, each narrowed to 16 bits by the assignment.
void store_packed_256(__m128i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
    int32_t* lanes = (int32_t*)(&packed);
    luma[j] = lanes[0];
    cb[j] = lanes[1];
    cr[j] = lanes[2];
}
/*#if USE_DOT_PRODUCT
//position entries properly in vectors ready for dot product
m[0] = _mm256_set_ps(0, in[2], in[1], in[0], 0, in[2], in[1], in[0]); //0rrr 0rrr
m[1] = _mm256_set_ps(0, in[5], in[4], in[3], 0, in[5], in[4], in[3]); //0ggg 0ggg
m[2] = _mm256_set_ps(0, in[8], in[7], in[6], 0, in[8], in[7], in[6]); //0bbb 0bbb
__m256 a = _mm256_dp_ps(v, m0, 0b01110001);
__m256 b = _mm256_dp_ps(v, m1, 0b01110010);
__m256 c = _mm256_dp_ps(v, m2, 0b01110100);
return _mm256_blend_ps(_mm256_blend_ps(a, b, 0b00100010), c, 0b01000100);
*/
// Loads the two pixels (six 32-bit values) starting at in[j*3] into a 256-bit
// vector laid out as aaa0bbb0: each pixel occupies four lanes and the fourth
// lane of each pixel is zero.
__m256i load_packed_256f(int j, const uint32_t* restrict in) {
#if USE_OPPORTUNISTIC_AVX512 && USE_AVX512_EXPANDLOAD && __AVX512VBMI2__ && __AVX512VL__ && __AVX512F__
    //todo: slow ZEN4
    return _mm256_maskz_expandloadu_epi32(_cvtu32_mask8(0b01110111), in+j*3);
#else
#if USE_OPPORTUNISTIC_AVX512 && __AVX512F__ && __AVX512BW__ && __AVX512VL__ && __AVX512DQ__
    return _mm256_permutexvar_epi32(_mm256_set_epi32(7, 5, 4, 3, 7, 2, 1, 0), _mm256_maskz_loadu_epi32(_cvtu32_mask8(0b00111111), in+j*3));
#else
    // j counts pixels, exactly as in the branches above and in every other
    // loader, so the source starts at in[j*3]. This branch used to read from
    // in[j], which picks the wrong values for every j other than 0.
    const uint32_t* px = in+j*3;
    return _mm256_set_epi32(0, px[5], px[4], px[3], 0, px[2], px[1], px[0]);
#endif
#endif
}
// Writes two converted pixels to the planes at index j and j+1.
// packed is laid out as aaa0bbb0: lanes 0-2 hold the first pixel and lanes
// 4-6 the second one.
void store_packed_256f(__m256i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
#if USE_OPPORTUNISTIC_AVX512 && __AVX512BW__ && __AVX512VL__ && __AVX512DQ__
    // Collect the low 16 bits of every component (elements 0-1 luma, 2-3 cb,
    // 4-5 cr) and write each pair with a masked store. The cb and cr base
    // pointers are moved back so the selected elements land on j and j+1.
#define _perm_idx256 _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component2(2), _perm_component2(1), _perm_component2(0))
    __m128i store = _mm256_castsi256_si128(_mm256_permutexvar_epi16(_perm_idx256, packed));
    _mm_mask_storeu_epi16(&luma[j], _cvtu32_mask8(0b00000011), store);
    _mm_mask_storeu_epi16(&cb[j-2], _cvtu32_mask8(0b00001100), store);
    _mm_mask_storeu_epi16(&cr[j-4], _cvtu32_mask8(0b00110000), store);
#else
    // j counts pixels, exactly as in the branch above and in every other
    // store helper. This branch used to write to index j/3, which does not
    // match the pixel the data was loaded from.
    const int32_t* lanes = (const int32_t*)(&packed);
    luma[j] = lanes[0];
    cb[j] = lanes[1];
    cr[j] = lanes[2];
    luma[j+1] = lanes[4];
    cb[j+1] = lanes[5];
    cr[j+1] = lanes[6];
#endif
}
#define SSE 1
#define AVX 1
#include "simd_utils/simd_utils.h"
static inline __m256d pow256_pd1(__m256d x, double y1) {
const __m256d y = _mm256_set1_pd(y1);
return exp256_pd(_mm256_mul_pd(y, log256_pd(x)));
}
static inline __m256 pow256_ps1(__m256 x, double y1) {
const __m256d y = _mm256_set1_pd(y1);
//low precision cause issues, use doubles
__m128 a = _mm256_cvtpd_ps(exp256_pd(_mm256_mul_pd(y, log256_pd(_mm256_cvtps_pd(_mm256_castps256_ps128(x))))));
__m128 b = _mm256_cvtpd_ps(exp256_pd(_mm256_mul_pd(y, log256_pd(_mm256_cvtps_pd(_mm256_extractf128_ps(x, 1))))));
return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
}
#endif
// Lookup tables for the generic converter macros. The suffix of every name is
// <floatType>_<elementCount>, where elementCount is the number of pixels
// processed per vector.
// Width in bits of the floating point vector used for each combination.
#define typed_vector_size_x86_64_float_5 512
#define typed_vector_size_x86_64_double_2 512
#define typed_vector_size_x86_64_float_2 256
#define typed_vector_size_x86_64_double_1 256
// Middle part of the name of the lane permute intrinsic for each combination.
#define typed_permute_lanes_x86_64_float_5 _permutexvar_
#define typed_permute_lanes_x86_64_double_2 _permutex_
#define typed_permute_lanes_x86_64_float_2 _permute_
#define typed_permute_lanes_x86_64_double_1 _permute4x64_
// Wrapper around _mm512_i32gather_ps that takes its arguments in the order of
// the 256-bit gather intrinsics (base address first). The switch exists
// because the intrinsic needs its scale as a compile-time constant.
// Only compiled when AVX-512F is available, since the intrinsic requires it;
// the 512-bit pipeline that refers to this wrapper is not built otherwise.
#if defined(__AVX512F__)
inline __attribute__((always_inline)) __m512 _int_i32gather_ps(void const* base_addr, __m512i vindex, int scale) {
    switch(scale){
        case 1:
            return _mm512_i32gather_ps(vindex, base_addr, 1);
        case 2:
            return _mm512_i32gather_ps(vindex, base_addr, 2);
        case 4:
            return _mm512_i32gather_ps(vindex, base_addr, 4);
        case 8:
            return _mm512_i32gather_ps(vindex, base_addr, 8);
        default:
            // Only 1, 2, 4 and 8 are valid scales.
            __builtin_unreachable();
    }
}
#endif
// Wrapper around _mm512_i32gather_pd that takes its arguments in the order of
// the 256-bit gather intrinsics (base address first). The switch exists
// because the intrinsic needs its scale as a compile-time constant.
// Only compiled when AVX-512F is available, since the intrinsic requires it;
// the 512-bit pipeline that refers to this wrapper is not built otherwise.
#if defined(__AVX512F__)
inline __attribute__((always_inline)) __m512d _int_i32gather_pd(void const* base_addr, __m256i vindex, int scale) {
    switch(scale){
        case 1:
            return _mm512_i32gather_pd(vindex, base_addr, 1);
        case 2:
            return _mm512_i32gather_pd(vindex, base_addr, 2);
        case 4:
            return _mm512_i32gather_pd(vindex, base_addr, 4);
        case 8:
            return _mm512_i32gather_pd(vindex, base_addr, 8);
        default:
            // Only 1, 2, 4 and 8 are valid scales.
            __builtin_unreachable();
    }
}
#endif
// Name tables used to assemble intrinsic names for the x86_64 pipelines.
// The suffix of every table entry is <floatType>_<elementCount>.
// The typed_permute_lanes_x86_64_* entries are defined once, next to the
// typed_vector_size_x86_64_* entries; the identical repetitions that used to
// follow in this section were removed.

// pow(x, scalar) helper per combination.
#define typed_pow1_x86_64_float_5 pow512_ps1
#define typed_pow1_x86_64_double_2 pow512_pd1
#define typed_pow1_x86_64_float_2 pow256_ps1
#define typed_pow1_x86_64_double_1 pow256_pd1
#define typed_pow1_x86_64(floatType, elementCount) typed_pow1_x86_64_##floatType##_##elementCount

// 32-bit index gather per combination.
#define typed_i32gather_x86_64_float_5 _int_i32gather_ps
#define typed_i32gather_x86_64_double_2 _int_i32gather_pd
#define typed_i32gather_x86_64_float_2 _mm256_i32gather_ps
#define typed_i32gather_x86_64_double_1 _mm256_i32gather_pd
#define typed_i32gather_x86_64(floatType, elementCount) typed_i32gather_x86_64_##floatType##_##elementCount

// Prefixes and suffixes of vector type and function names.
#define typed_vector_func_prefix_x86_64 _mm
#define typed_vector_type_prefix_x86_64 __m
#define typed_vector_type_suffix_x86_64_double d
#define typed_vector_type_suffix_x86_64_int i

// Width of the integer vector per combination and the matching function
// prefix (128-bit functions use the bare _mm prefix).
#define typed_vector_int_size_x86_64_float_5 512
#define typed_vector_int_func_x86_64_float_5 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_float_5)
#define typed_vector_int_size_x86_64_double_2 256
#define typed_vector_int_func_x86_64_double_2 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_double_2)
#define typed_vector_int_size_x86_64_float_2 256
#define typed_vector_int_func_x86_64_float_2 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_float_2)
#define typed_vector_int_size_x86_64_double_1 128
#define typed_vector_int_func_x86_64_double_1 typed_vector_func_prefix_x86_64

// Element type suffix of floating point intrinsics.
#define typed_vector_func_type_x86_64_float ps
#define typed_vector_func_type_x86_64_double pd

// Vector type names, e.g. __m256 / __m256d / __m256i.
#define typed_vector_type_x86_64_float(elementCount) concat(typed_vector_type_prefix_x86_64, typed_vector_size_x86_64_float_##elementCount)
#define typed_vector_type_x86_64_double(elementCount) concat3(typed_vector_type_prefix_x86_64, typed_vector_size_x86_64_double_##elementCount, typed_vector_type_suffix_x86_64_double)
#define typed_vector_type_x86_64(floatType, elementCount) typed_vector_type_x86_64_##floatType(elementCount)
#define typed_vector_int_type_x86_64(floatType, elementCount) concat3(typed_vector_type_prefix_x86_64, typed_vector_int_size_x86_64_##floatType##_##elementCount, typed_vector_type_suffix_x86_64_int)

// Function name prefixes, e.g. _mm256 / _mm512.
#define typed_vector_func_x86_64_float(elementCount) concat(typed_vector_func_prefix_x86_64, typed_vector_size_x86_64_float_##elementCount)
#define typed_vector_func_x86_64_double(elementCount) concat(typed_vector_func_prefix_x86_64, typed_vector_size_x86_64_double_##elementCount)
#define typed_vector_func_x86_64(floatType, elementCount) typed_vector_func_x86_64_##floatType(elementCount)

// Middle parts of the intrinsic names.
#define typed_fmadd_x86_64 _fmadd_
#define typed_add_x86_64 _add_
#define typed_mul_x86_64 _mul_
#define typed_set1_x86_64 _set1_
#define typed_set_x86_64 _set_
#define typed_min_x86_64 _min_
#define typed_max_x86_64 _max_
#define typed_seti_x86_64 _set_epi32
#define typed_addi_x86_64 _add_epi32
#define typed_cvt_x86_64_float _cvtps_epi32
#define typed_cvt_x86_64_double _cvtpd_epi32

// Complete intrinsic names assembled from the parts above.
#define typed_func_x86_64_fmadd(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_fmadd_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_add(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_add_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_mul(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_mul_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_set1(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_set1_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_set(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_set_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_min(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_min_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_max(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_max_x86_64, typed_vector_func_type_x86_64_##floatType)
#define typed_func_x86_64_ftoi(floatType, elementCount) concat(typed_vector_func_x86_64(floatType, elementCount), typed_cvt_x86_64_##floatType)
#define typed_func_x86_64_seti(floatType, elementCount) concat(typed_vector_int_func_x86_64_##floatType##_##elementCount, typed_seti_x86_64)
#define typed_func_x86_64_addi(floatType, elementCount) concat(typed_vector_int_func_x86_64_##floatType##_##elementCount, typed_addi_x86_64)
#define typed_func_x86_64_pow1(floatType, elementCount) typed_pow1_x86_64(floatType, elementCount)
#define typed_func_x86_64_i32gather(floatType, elementCount) typed_i32gather_x86_64(floatType, elementCount)
#define typed_func_x86_64_permute_lanes(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_permute_lanes_x86_64_##floatType##_##elementCount, typed_vector_func_type_x86_64_##floatType)

420
conv/conv_gen.h Normal file
View file

@ -0,0 +1,420 @@
/*
 * Two-level helper macros. The outer macro forwards to the underscore-prefixed
 * inner one so that arguments are macro-expanded before being used, stringified
 * or pasted.
 */
/* lit(s): s after macro expansion. */
#define _lit(s) s
#define lit(s) _lit(s)
/* str(s): string literal of s after macro expansion. */
#define _str(s) #s
#define str(s) _str(s)
/* concat(a,b): paste two expanded tokens into one identifier. */
#define _concat(a,b) a##b
#define concat(a,b) _concat(a,b)
/* concat3(a,b,c): paste three expanded tokens into one identifier. */
#define _concat3(a,b,c) a##b##c
#define concat3(a,b,c) _concat3(a,b,c)
/*
 * Gather wrappers: look up a vector of packed indices in the XYZ -> linear
 * table of the matching element type. The last argument is the byte scale
 * passed to the gather (8 for double, 4 for float). Only a 12-bit lookup table
 * is supported; any other XYZ_LOOKUP_TABLE_SIZE is a compile error.
 */
#if XYZ_LOOKUP_TABLE_SIZE==12
#define typed_gather_double(i32gather, packed) i32gather(xyz12_to_linear_double, packed, 8)
#define typed_gather_float(i32gather, packed) i32gather(xyz12_to_linear_float, packed, 4)
#else
#error "Not supported"
#endif
/*
 * Per-pixel index triplets for the variable permute used by the 5-pixel
 * variant. Arguments are listed highest element first (set-style order), so
 * for pixel l: step 1 yields dst[3l..3l+2] = src[3l+1, 3l+2, 3l] and step 2
 * yields dst[3l..3l+2] = src[3l+2, 3l, 3l+1]. `l` is only ever a literal here,
 * so it is not parenthesized.
 */
#define _shuf_lane_step1(l) 3*l, 3*l+2, 3*l+1
#define _shuf_lane_step2(l) 3*l+1, 3*l, 3*l+2
/* Builds the full 16-entry index vector; the top element (index 0xf) maps to itself. */
#define _shuf_idx(seti, step) seti(0xf, _shuf_lane_step##step(4), _shuf_lane_step##step(3), _shuf_lane_step##step(2), _shuf_lane_step##step(1), _shuf_lane_step##step(0))
// xxxxxyyyyyzzzzz0
// NOTE(review): the matrix loaders and shuffle indices in this header treat the
// vector as interleaved per-pixel triplets plus one padding element; confirm
// whether the layout sketch above is still accurate.
/*
 * 3x3 matrix * vector for 5 pixels at once, FMA only:
 *   v*m0 + rot1(v)*m1 + rot2(v)*m2
 * where m0/m1/m2 hold the matrix diagonals as laid out by
 * typed_mat_load_func_5 and rot1/rot2 are the two in-pixel rotations above.
 * fVectorType and add are accepted for signature parity but unused.
 */
#define typed_mul_vec_dot_5_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
fmadd( \
permuteLanesVar(_shuf_idx(seti, 2), v), \
m2, \
fmadd( \
permuteLanesVar(_shuf_idx(seti, 1), v), \
m1, \
mul(v, m0) \
) \
)
/*
 * Same product for the 2-pixel layout using an immediate permute.
 * Assuming the usual 2-bit-per-destination immediate encoding, 0b11001001
 * selects sources (1,2,0,3) and 0b11010010 selects sources (2,0,1,3) for
 * destinations (0,1,2,3) - i.e. the same two rotations, padding lane untouched.
 */
#define typed_mul_vec_dot_2_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
fmadd( \
permuteLanes(v, 0b11010010), \
m2, \
fmadd( \
permuteLanes(v, 0b11001001), \
m1, \
mul(v, m0) \
) \
)
/* Non-FMA fallback of the 2-pixel product: separate multiplies and adds. */
#define typed_mul_vec_dot_2_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
add( \
add( \
mul(v, m0), \
mul(permuteLanes(v, 0b11001001), m1) \
), \
mul(permuteLanes(v, 0b11010010), m2) \
)
/* The 1-pixel layout reuses the 2-pixel expressions unchanged. */
#define typed_mul_vec_dot_1_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
typed_mul_vec_dot_2_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v)
#define typed_mul_vec_dot_1_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
typed_mul_vec_dot_2_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v)
/*
 * Dispatchers: `fma` (0 or 1) is pasted onto the name to pick the variant.
 * Only an fma_1 variant exists for the 5-pixel layout in this file.
 */
#define typed_mul_vec_dot_1(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
typed_mul_vec_dot_1_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
#define typed_mul_vec_dot_2(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
typed_mul_vec_dot_2_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
#define typed_mul_vec_dot_5(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
typed_mul_vec_dot_5_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
/*
 * Integer offset vectors added after float -> int conversion. Arguments are
 * listed highest element first; within each pixel triplet the lowest element
 * (luma) gets 0 and the two upper elements (chroma) get int16MaxPlusOne.
 * Padding elements get 0.
 */
#define typed_yuv_add_5(seti) \
seti(0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0)
#define typed_yuv_add_2(seti) \
seti(0, int16MaxPlusOne, int16MaxPlusOne, 0, 0, int16MaxPlusOne, int16MaxPlusOne, 0)
#define typed_yuv_add_1(seti) \
seti(0, int16MaxPlusOne, int16MaxPlusOne, 0)
/*
 * Matrix loaders: spread a row-major 3x3 matrix `in` (in[row*3+col]) into
 * three vectors m[0..2], repeated once per pixel slot:
 *   m[0] = main diagonal            (in[0], in[4], in[8])
 *   m[1] = diagonal shifted by one  (in[1], in[5], in[6])
 *   m[2] = diagonal shifted by two  (in[2], in[3], in[7])
 * so that v*m[0] + rot1(v)*m[1] + rot2(v)*m[2] is the matrix-vector product.
 * Arguments are listed highest element first; padding elements are 0.
 * Each expansion ends with a ';', so call sites add none.
 */
#define typed_mat_load_func_5(setf, m, in) \
m[0] = setf(0, in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0]); /*0 bgr bgr bgr bgr bgr*/ \
m[1] = setf(0, in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1]); /*0 rbg rbg rbg rbg rbg*/ \
m[2] = setf(0, in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2]); /*0 grb grb grb grb grb*/
#define typed_mat_load_func_2(setf, m, in) \
m[0] = setf(0, in[8], in[4], in[0], 0, in[8], in[4], in[0]); /* 0bgr 0bgr*/ \
m[1] = setf(0, in[6], in[5], in[1], 0, in[6], in[5], in[1]); /* 0rbg 0rbg*/ \
m[2] = setf(0, in[7], in[3], in[2], 0, in[7], in[3], in[2]); /* 0grb 0grb*/
#define typed_mat_load_func_1(setf, m, in) \
m[0] = setf(0, in[8], in[4], in[0]); /* 0bgr*/ \
m[1] = setf(0, in[6], in[5], in[1]); /* 0rbg*/ \
m[2] = setf(0, in[7], in[3], in[2]); /* 0grb*/
// todo: optimize pow via https://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent ?
/* Name stems of the generated functions; the float type name is pasted onto each. */
#define _load_matrix_func _load_matrix_
#define _line_func convert_line_dci_xyz12_to_yuv16_
//concat(_line_func, floatType)
/*
 * typed_converter_pump1: emits, for one floatType,
 *   - storage for the xyz2rgb and rgb2yuv matrices (3 vectors each) and the
 *     XYZ -> linear lookup table,
 *   - _load_matrix_<floatType>(m, in): fills m[0..2] from a 3x3 double matrix,
 *   - convert_line_dci_xyz12_to_yuv16_<floatType>(...): converts one line,
 *     one vector (elementCount pixels) per loop iteration.
 * Per-vector pipeline: packed load -> table gather -> xyz2rgb product ->
 * clamp to [0,1] -> pow(rgbGamma_g) -> rgb2yuv product -> float->int plus
 * chroma offset -> packed store.
 * Naming caveat: minValue holds 1.0 (the bound passed to minf, i.e. the upper
 * clamp) and maxValue holds 0.0 (the bound passed to maxf, i.e. the lower clamp).
 * The loop steps j by elementCount with no tail handling here, and `height`
 * is unused. NOTE(review): presumably the load/store helpers or the caller
 * cope with widths that are not a multiple of the step - confirm.
 */
#define typed_converter_pump1(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
\
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
typed_mat_load_func_##elementCount(setf, m, in) \
}; \
\
\
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
iPackedType packed; \
fVectorType xyz; \
fVectorType rgb; \
fVectorType yuv; \
\
const fVectorType minValue = set1f(1.); \
const fVectorType maxValue = set1f(0.); \
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
\
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
\
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
\
for (int j = 0; j < width; j += elementCount) { \
packed = packedLoadFunc(j, in); \
\
xyz = typed_gather_##floatType(i32gather, packed); \
\
rgb = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz); \
\
rgb = maxf(minf(rgb, minValue), maxValue); \
\
rgb = fpow(rgb, rgbGamma_g); \
\
yuv = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb); \
\
packed = addi(ftoi(yuv), yuv2packed_add); \
\
packedStoreFunc(packed, j, luma, cb, cr); \
} \
};
/*
 * typed_converter_pump2: same generated globals, matrix loader and line
 * converter as the single-pump variant, but the loop body is unrolled to
 * handle two vectors (2 * elementCount pixels) per iteration, with each
 * pipeline stage issued for both vectors before moving to the next stage.
 * Stages: packed load -> table gather -> xyz2rgb product -> clamp to [0,1] ->
 * pow(rgbGamma_g) -> rgb2yuv product -> float->int plus chroma offset ->
 * packed store.
 * minValue holds 1.0 (upper clamp via minf); maxValue holds 0.0 (lower clamp
 * via maxf). No tail handling is visible here and `height` is unused.
 * NOTE(review): width is presumably expected to be a multiple of
 * 2 * elementCount (or buffers padded) - confirm.
 */
#define typed_converter_pump2(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
\
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
typed_mat_load_func_##elementCount(setf, m, in) \
}; \
\
\
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
iPackedType packed0, packed1; \
fVectorType xyz0, xyz1; \
fVectorType rgb0, rgb1; \
fVectorType yuv0, yuv1; \
\
const fVectorType minValue = set1f(1.); \
const fVectorType maxValue = set1f(0.); \
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
\
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
\
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
\
for (int j = 0; j < width; j += elementCount*2) { \
packed0 = packedLoadFunc(j, in); \
packed1 = packedLoadFunc(j+elementCount, in); \
\
xyz0 = typed_gather_##floatType(i32gather, packed0); \
xyz1 = typed_gather_##floatType(i32gather, packed1); \
\
rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
\
rgb0 = maxf(minf(rgb0, minValue), maxValue); \
rgb1 = maxf(minf(rgb1, minValue), maxValue); \
\
rgb0 = fpow(rgb0, rgbGamma_g); \
rgb1 = fpow(rgb1, rgbGamma_g); \
\
yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
\
packed0 = addi(ftoi(yuv0), yuv2packed_add); \
packed1 = addi(ftoi(yuv1), yuv2packed_add); \
\
packedStoreFunc(packed0, j, luma, cb, cr); \
packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
} \
};
/*
 * typed_converter_pump4: same generated globals, matrix loader and line
 * converter as the single-pump variant, but the loop body is unrolled to
 * handle four vectors (4 * elementCount pixels) per iteration, with each
 * pipeline stage issued for all four vectors before moving to the next stage.
 * Stages: packed load -> table gather -> xyz2rgb product -> clamp to [0,1] ->
 * pow(rgbGamma_g) -> rgb2yuv product -> float->int plus chroma offset ->
 * packed store.
 * minValue holds 1.0 (upper clamp via minf); maxValue holds 0.0 (lower clamp
 * via maxf). No tail handling is visible here and `height` is unused.
 * NOTE(review): width is presumably expected to be a multiple of
 * 4 * elementCount (or buffers padded) - confirm.
 */
#define typed_converter_pump4(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
\
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
typed_mat_load_func_##elementCount(setf, m, in) \
}; \
\
\
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
iPackedType packed0, packed1, packed2, packed3; \
fVectorType xyz0, xyz1, xyz2, xyz3; \
fVectorType rgb0, rgb1, rgb2, rgb3; \
fVectorType yuv0, yuv1, yuv2, yuv3; \
\
const fVectorType minValue = set1f(1.); \
const fVectorType maxValue = set1f(0.); \
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
\
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
\
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
\
for (int j = 0; j < width; j += elementCount*4) { \
packed0 = packedLoadFunc(j, in); \
packed1 = packedLoadFunc(j+elementCount, in); \
packed2 = packedLoadFunc(j+elementCount*2, in); \
packed3 = packedLoadFunc(j+elementCount*3, in); \
\
xyz0 = typed_gather_##floatType(i32gather, packed0); \
xyz1 = typed_gather_##floatType(i32gather, packed1); \
xyz2 = typed_gather_##floatType(i32gather, packed2); \
xyz3 = typed_gather_##floatType(i32gather, packed3); \
\
rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
rgb2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz2); \
rgb3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz3); \
\
rgb0 = maxf(minf(rgb0, minValue), maxValue); \
rgb1 = maxf(minf(rgb1, minValue), maxValue); \
rgb2 = maxf(minf(rgb2, minValue), maxValue); \
rgb3 = maxf(minf(rgb3, minValue), maxValue); \
\
rgb0 = fpow(rgb0, rgbGamma_g); \
rgb1 = fpow(rgb1, rgbGamma_g); \
rgb2 = fpow(rgb2, rgbGamma_g); \
rgb3 = fpow(rgb3, rgbGamma_g); \
\
yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
yuv2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb2); \
yuv3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb3); \
\
packed0 = addi(ftoi(yuv0), yuv2packed_add); \
packed1 = addi(ftoi(yuv1), yuv2packed_add); \
packed2 = addi(ftoi(yuv2), yuv2packed_add); \
packed3 = addi(ftoi(yuv3), yuv2packed_add); \
\
packedStoreFunc(packed0, j, luma, cb, cr); \
packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
packedStoreFunc(packed2, j+elementCount*2, luma, cb, cr); \
packedStoreFunc(packed3, j+elementCount*3, luma, cb, cr); \
} \
};
/*
 * typed_converter_pump8: same generated globals, matrix loader and line
 * converter as the single-pump variant, but the loop body is unrolled to
 * handle eight vectors (8 * elementCount pixels) per iteration, with each
 * pipeline stage issued for all eight vectors before moving to the next stage.
 * Stages: packed load -> table gather -> xyz2rgb product -> clamp to [0,1] ->
 * pow(rgbGamma_g) -> rgb2yuv product -> float->int plus chroma offset ->
 * packed store.
 * minValue holds 1.0 (upper clamp via minf); maxValue holds 0.0 (lower clamp
 * via maxf). No tail handling is visible here and `height` is unused.
 * NOTE(review): width is presumably expected to be a multiple of
 * 8 * elementCount (or buffers padded) - confirm.
 */
#define typed_converter_pump8(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
\
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
typed_mat_load_func_##elementCount(setf, m, in) \
}; \
\
\
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
iPackedType packed0, packed1, packed2, packed3, packed4, packed5, packed6, packed7; \
fVectorType xyz0, xyz1, xyz2, xyz3, xyz4, xyz5, xyz6, xyz7; \
fVectorType rgb0, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7; \
fVectorType yuv0, yuv1, yuv2, yuv3, yuv4, yuv5, yuv6, yuv7; \
\
const fVectorType minValue = set1f(1.); \
const fVectorType maxValue = set1f(0.); \
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
\
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
\
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
\
for (int j = 0; j < width; j += elementCount*8) { \
packed0 = packedLoadFunc(j, in); \
packed1 = packedLoadFunc(j+elementCount, in); \
packed2 = packedLoadFunc(j+elementCount*2, in); \
packed3 = packedLoadFunc(j+elementCount*3, in); \
packed4 = packedLoadFunc(j+elementCount*4, in); \
packed5 = packedLoadFunc(j+elementCount*5, in); \
packed6 = packedLoadFunc(j+elementCount*6, in); \
packed7 = packedLoadFunc(j+elementCount*7, in); \
\
xyz0 = typed_gather_##floatType(i32gather, packed0); \
xyz1 = typed_gather_##floatType(i32gather, packed1); \
xyz2 = typed_gather_##floatType(i32gather, packed2); \
xyz3 = typed_gather_##floatType(i32gather, packed3); \
xyz4 = typed_gather_##floatType(i32gather, packed4); \
xyz5 = typed_gather_##floatType(i32gather, packed5); \
xyz6 = typed_gather_##floatType(i32gather, packed6); \
xyz7 = typed_gather_##floatType(i32gather, packed7); \
\
rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
rgb2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz2); \
rgb3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz3); \
rgb4 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz4); \
rgb5 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz5); \
rgb6 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz6); \
rgb7 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz7); \
\
rgb0 = maxf(minf(rgb0, minValue), maxValue); \
rgb1 = maxf(minf(rgb1, minValue), maxValue); \
rgb2 = maxf(minf(rgb2, minValue), maxValue); \
rgb3 = maxf(minf(rgb3, minValue), maxValue); \
rgb4 = maxf(minf(rgb4, minValue), maxValue); \
rgb5 = maxf(minf(rgb5, minValue), maxValue); \
rgb6 = maxf(minf(rgb6, minValue), maxValue); \
rgb7 = maxf(minf(rgb7, minValue), maxValue); \
\
rgb0 = fpow(rgb0, rgbGamma_g); \
rgb1 = fpow(rgb1, rgbGamma_g); \
rgb2 = fpow(rgb2, rgbGamma_g); \
rgb3 = fpow(rgb3, rgbGamma_g); \
rgb4 = fpow(rgb4, rgbGamma_g); \
rgb5 = fpow(rgb5, rgbGamma_g); \
rgb6 = fpow(rgb6, rgbGamma_g); \
rgb7 = fpow(rgb7, rgbGamma_g); \
\
yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
yuv2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb2); \
yuv3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb3); \
yuv4 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb4); \
yuv5 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb5); \
yuv6 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb6); \
yuv7 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb7); \
\
packed0 = addi(ftoi(yuv0), yuv2packed_add); \
packed1 = addi(ftoi(yuv1), yuv2packed_add); \
packed2 = addi(ftoi(yuv2), yuv2packed_add); \
packed3 = addi(ftoi(yuv3), yuv2packed_add); \
packed4 = addi(ftoi(yuv4), yuv2packed_add); \
packed5 = addi(ftoi(yuv5), yuv2packed_add); \
packed6 = addi(ftoi(yuv6), yuv2packed_add); \
packed7 = addi(ftoi(yuv7), yuv2packed_add); \
\
packedStoreFunc(packed0, j, luma, cb, cr); \
packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
packedStoreFunc(packed2, j+elementCount*2, luma, cb, cr); \
packedStoreFunc(packed3, j+elementCount*3, luma, cb, cr); \
packedStoreFunc(packed4, j+elementCount*4, luma, cb, cr); \
packedStoreFunc(packed5, j+elementCount*5, luma, cb, cr); \
packedStoreFunc(packed6, j+elementCount*6, luma, cb, cr); \
packedStoreFunc(packed7, j+elementCount*7, luma, cb, cr); \
} \
};
/*
 * Platform dispatch layer: each macro pastes `platform` (and, where needed,
 * floatType / elementCount) onto a stem to select the platform-specific
 * definition of a function name, vector type or size fragment.
 */
/* Resolves operation `func` (set1, set, seti, fmadd, mul, add, ...) for a platform. */
#define typed_func(platform, floatType, elementCount, func) typed_func_##platform##_##func(floatType, elementCount)
/* Float and integer vector types for a (platform, floatType, elementCount) triple. */
#define typed_vector_type(platform, floatType, elementCount) typed_vector_type_##platform(floatType, elementCount)
#define typed_vector_int_type(platform, floatType, elementCount) typed_vector_int_type_##platform(floatType, elementCount)
/* Integer vector size fragment (defined once; an identical duplicate was removed). */
#define typed_vector_int_size(platform, floatType, elementCount) typed_vector_int_size_##platform##_##floatType##_##elementCount
/*
 * NOTE(review): this expands to a `typed_vector_int_funcsize_...` name, while
 * the x86_64 macros visible in this commit paste `typed_vector_int_func_...`
 * directly; confirm the `funcsize` stem is intended or that this macro is unused.
 */
#define typed_vector_int_func(platform, floatType, elementCount) typed_vector_int_funcsize_##platform##_##floatType##_##elementCount
/* Selects typed_converter_pump<pumps> (1, 2, 4 or 8 vectors per loop iteration). */
#define typed_converter_pumps(pumps) concat(typed_converter_pump, pumps)
/*
 * Instantiates a converter for one platform/float type: resolves every
 * type and operation name through the dispatch macros above and forwards
 * them, in the order the pump macros expect, together with the caller's
 * FMA flag and packed load/store helpers.
 */
#define typed_converter_platform(platform, pumps, floatType, elementCount, hasFMA, packedLoadFunc, packedStoreFunc) \
typed_converter_pumps(pumps)(floatType, \
typed_vector_type(platform, floatType, elementCount), \
typed_vector_int_type(platform, floatType, elementCount), \
elementCount, \
typed_func(platform, floatType, elementCount, set1), \
typed_func(platform, floatType, elementCount, set), \
typed_func(platform, floatType, elementCount, seti), \
typed_func(platform, floatType, elementCount, fmadd), \
typed_func(platform, floatType, elementCount, mul), \
typed_func(platform, floatType, elementCount, add), \
hasFMA, \
typed_func(platform, floatType, elementCount, permute_lanes), \
typed_func(platform, floatType, elementCount, i32gather), \
typed_func(platform, floatType, elementCount, min), \
typed_func(platform, floatType, elementCount, max), \
typed_func(platform, floatType, elementCount, addi), \
typed_func(platform, floatType, elementCount, ftoi), \
typed_func(platform, floatType, elementCount, pow1), \
packedLoadFunc, \
packedStoreFunc \
)

58
conv/conv_generic.h Normal file
View file

@ -0,0 +1,58 @@
/*
 * Generic (scalar) converter template. The including file must define fType
 * (the floating-point element type) before including this header.
 */
/* Name of the XYZ -> linear lookup table for a given float type. */
#define xyz12_to_linear_name xyz12_to_linear_
#define xyz12_to_linear_type(fType) concat(xyz12_to_linear_name, fType)
/* Lookup table indexed by the (shifted) 12-bit code value. */
fType xyz12_to_linear_type(fType)[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64)));
/*
 * clamp(v, min, max): v limited to [min, max]. Arguments are parenthesized so
 * that expression arguments (e.g. `a + b`, `c ? x : y`) keep their meaning;
 * note they may still be evaluated more than once.
 */
#define clamp(v, min, max) ((v) < (min) ? (min) : ((v) > (max) ? (max) : (v)))
/* M(m, i, j): element (row i, column j) of a row-major 3x3 matrix. */
#define M(m, i, j) ((m)[(i)*3+(j)])
/* Name stems; the float type name is pasted onto each. */
#define mxv_step_name mxv_step_
#define _line_name convert_line_dci_xyz12_to_yuv16_
/*
 * One row of a 3x3 matrix-vector product: returns the dot product of row
 * `step` of the row-major `matrix` with the vector (a, b, c).
 */
inline __attribute__((always_inline)) fType concat(mxv_step_name, fType) (const double* matrix, int step, fType a, fType b, fType c) {
    const double* row = &matrix[step*3];
    return row[0]*a + row[1]*b + row[2]*c;
}
/*
 * Scalar line converter. For each of `width` pixels, reads three interleaved
 * code values from `in`, maps them through the XYZ -> linear table, applies
 * xyz2rgb_g, clamps to [0,1], raises to rgbGamma_g, applies rgb2yuv_g and
 * writes rounded, clamped 16-bit luma/cb/cr (chroma offset by
 * int16MaxPlusOne). `height` is unused.
 */
inline __attribute__((always_inline)) void concat(_line_name, fType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
    fType xyz[3] __attribute__((aligned(32)));
    fType rgb[3] __attribute__((aligned(32)));
    fType yuv[3] __attribute__((aligned(32)));
    const fType rgbGamma_l = rgbGamma_g;

    for (int j = 0; j < width; j++) {
        /* table lookup of the three interleaved components */
        for (int c = 0; c < 3; c++) {
            xyz[c] = xyz12_to_linear_type(fType)[in[j*3+c] << (XYZ_LOOKUP_TABLE_SIZE - 12)];
        }

        /* XYZ -> RGB, then limit to the unit range */
        for (int c = 0; c < 3; c++) {
            rgb[c] = concat(mxv_step_name, fType)(xyz2rgb_g, c, xyz[0], xyz[1], xyz[2]);
        }
        for (int c = 0; c < 3; c++) {
            rgb[c] = clamp(rgb[c], 0., 1.);
        }

        // todo: optimize this via https://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent ?
        /* pick the pow variant matching the element size */
        for (int c = 0; c < 3; c++) {
            if (sizeof(fType) == 4) {
                rgb[c] = powf(rgb[c], rgbGamma_l);
            } else if (sizeof(fType) == 8) {
                rgb[c] = pow(rgb[c], rgbGamma_l);
            }
        }

        /* RGB -> YUV */
        for (int c = 0; c < 3; c++) {
            yuv[c] = concat(mxv_step_name, fType) (rgb2yuv_g, c, rgb[0], rgb[1], rgb[2]);
        }

        /* round, offset chroma, clamp to 16 bits */
        luma[j] = clamp((int)(round(yuv[0])), 0, uint16Max);
        cb[j] = clamp((int)(round(yuv[1])) + int16MaxPlusOne, 0, uint16Max);
        cr[j] = clamp((int)(round(yuv[2])) + int16MaxPlusOne, 0, uint16Max);
    }
}

1
conv/simd_utils Submodule

@ -0,0 +1 @@
Subproject commit e0aa01336b63d0c9d351a09dc24e0b22483219ad

80
convert.go Normal file
View file

@ -0,0 +1,80 @@
package main
import (
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/colorspace"
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/conv"
"gonum.org/v1/gonum/mat"
)
// useCConverter routes conversion through the conv package instead of the
// pure-Go path; useFloat then picks its Float variant over the Double one.
var useCConverter, useFloat bool
// ConvertFrame converts a whole frame of interleaved XYZ samples (three
// uint32 per pixel) in in to 16-bit planes y, cb and cr. With useCConverter
// set the frame is handed to the conv package in one call; otherwise it is
// converted line by line with ConvertLine.
func ConvertFrame(in []uint32, y, cb, cr []uint16, width, height int) {
	switch {
	case useCConverter && useFloat:
		conv.ConvertFrameDCIXYZToYUV16Float(in, y, cb, cr, width, height)
		return
	case useCConverter:
		conv.ConvertFrameDCIXYZToYUV16Double(in, y, cb, cr, width, height)
		return
	}

	// Pure-Go path: advance every buffer by one line after converting it.
	inStride := width * 3
	for remaining := height; remaining > 0; remaining-- {
		ConvertLine(in, y, cb, cr, width, height)
		in, y, cb, cr = in[inStride:], y[width:], cb[width:], cr[width:]
	}
}
// ConvertLine converts the first width pixels of in (three interleaved
// uint32 samples per pixel) into y, cb and cr. With useCConverter set the
// work is delegated to the conv package; otherwise each pixel goes through
// LUT linearization, the xyz2rgbDenorm matrix, clamping to [0,1], the
// space.FromLinear transfer function, the rgb2yuv matrix and finally
// full-range 16-bit quantization.
func ConvertLine(in []uint32, y, cb, cr []uint16, width, height int) {
	switch {
	case useCConverter && useFloat:
		conv.ConvertLineDCIXYZToYUV16Float(in, y, cb, cr, width, height)
		return
	case useCConverter:
		conv.ConvertLineDCIXYZToYUV16Double(in, y, cb, cr, width, height)
		return
	}

	xyz := mat.NewVecDense(3, nil)
	rgb := mat.NewVecDense(3, nil)
	yuv := mat.NewVecDense(3, nil)

	for px := 0; px < width; px++ {
		// LUT: samples are shifted left by 4 to form the table index.
		for c := 0; c < 3; c++ {
			xyz.SetVec(c, colorspace.DCIXYZSystem.ToLinearLUT[uint16(in[px*3+c])<<4])
		}

		//TODO: apply white point correction here if necessary (but after denorm)

		// denormalize + xyz2rgb
		rgb.MulVec(xyz2rgbDenorm, xyz)

		//todo: some out of bounds r,g,b come up from here, maybe just fine

		for c := 0; c < 3; c++ {
			// Clamp into [0,1]; necessary due to XYZ ranges.
			// todo: check conversion matrix for preserving this
			clamped := min(1.0, max(0, rgb.AtVec(c)))

			// companding / adjustment with gamma curve
			//TODO: why is it not using normal Rec709 and instead using straight gamma curve
			rgb.SetVec(c, space.FromLinear(clamped))
		}

		yuv.MulVec(rgb2yuv, rgb)

		// Scale float range to 16-bit precision, in full swing.
		y[px] = LumaToFull16(yuv.AtVec(0))
		cb[px] = ChromaToFull16(yuv.AtVec(1))
		cr[px] = ChromaToFull16(yuv.AtVec(2))
	}
}

5
go.mod Normal file
View file

@ -0,0 +1,5 @@
module git.gammaspectra.live/WeebDataHoarder/xyz2yuv
go 1.21
require gonum.org/v1/gonum v0.14.0

4
go.sum Normal file
View file

@ -0,0 +1,4 @@
golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
golang.org/x/exp v0.0.0-20230321023759-10a507213a29/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=

27
job.go Normal file
View file

@ -0,0 +1,27 @@
package main
import (
"sync"
)
// frameJobData bundles everything needed to convert one frame: the input
// samples, the three output planes, the frame dimensions and the WaitGroup
// that Process signals when the conversion is finished.
type frameJobData struct {
wg *sync.WaitGroup // marked Done by Process
frame int // frame number; not read by Process
height int
width int
in []uint32 // input samples passed to ConvertFrame
y []uint16 // output planes written by ConvertFrame
cb []uint16
cr []uint16
}
// Process converts the job's frame with ConvertFrame and signals job.wg when
// done (also if ConvertFrame panics, since Done is deferred).
func (job frameJobData) Process() {
defer job.wg.Done()
ConvertFrame(job.in, job.y, job.cb, job.cr, job.width, job.height)
}
// decodedFrame pairs a frame's sequence number with its sample data.
type decodedFrame struct {
Number int
Frame []uint32
}

102
libav/libav.go Normal file
View file

@ -0,0 +1,102 @@
package libav
/*
#cgo pkg-config: libavformat libavcodec libavutil
#include <libavformat/avformat.h>
*/
import "C"
import (
"errors"
"io"
"slices"
"unsafe"
)
// PacketData is one demuxed video packet handed to the packet callback of
// OpenXYZ12.
type PacketData struct {
// Number is the zero-based index of the packet within the video stream.
Number int
// Data is a Go-owned copy of the packet payload.
Data []byte
}
// OpenXYZ12 opens inputFile with libavformat, verifies that its best video
// stream is JPEG 2000 in xyz12le, and feeds the stream's packets to
// packetFunc in order.
//
// initFunc is called once before any packet with the stream's frame rate,
// sample aspect ratio and dimensions. packetFunc receives a Go-owned copy of
// every video packet; returning io.EOF (possibly wrapped) stops reading
// without an error, any other error aborts and is returned.
//
// Setup failures (file cannot be opened, no suitable stream, allocation
// failure) are returned as errors rather than panicking, so callers can
// report them. Any av_read_frame failure is treated as end of input.
func OpenXYZ12(inputFile string, initFunc func(framerateNum, framerateDen, sarNum, sarDen, width, height int) error, packetFunc func(packet PacketData) error) error {
	var fmtCtx *C.AVFormatContext

	// Open file and demuxer.
	inputFileC := C.CString(inputFile)
	defer C.free(unsafe.Pointer(inputFileC))

	if ret := C.avformat_open_input(&fmtCtx, inputFileC, nil, nil); ret < 0 {
		return errors.New("cannot open input file")
	}
	defer C.avformat_close_input(&fmtCtx)

	if ret := C.avformat_find_stream_info(fmtCtx, nil); ret < 0 {
		return errors.New("cannot find stream information")
	}

	// Locate the video stream.
	videoStreamIndex := C.av_find_best_stream(fmtCtx, C.AVMEDIA_TYPE_VIDEO, -1, -1, nil, 0)
	if videoStreamIndex < 0 {
		return errors.New("cannot find video stream")
	}

	inputStream := unsafe.Slice(fmtCtx.streams, fmtCtx.nb_streams)[videoStreamIndex]
	codecPar := inputStream.codecpar

	if codecPar.codec_id != C.AV_CODEC_ID_JPEG2000 {
		return errors.New("video codec not JPEG2000")
	}
	if codecPar.format != C.AV_PIX_FMT_XYZ12LE {
		return errors.New("video format not xyz12le")
	}

	codecPar.color_range = C.AVCOL_RANGE_JPEG
	codecPar.color_primaries = C.AVCOL_PRI_SMPTE431
	//codecPar.color_trc = C.AVCOL_PRI_SMPTE431
	//codecPar.color_space = C.AVCOL_PRI_SMPTE431

	packet := C.av_packet_alloc()
	if packet == nil {
		return errors.New("err allocating")
	}
	// av_packet_free also releases any payload still referenced by packet.
	defer C.av_packet_free(&packet)

	// NOTE(review): AVStream.codec is deprecated in newer FFmpeg releases;
	// confirm the targeted libavformat version still exposes it.
	if err := initFunc(int(inputStream.codec.framerate.num), int(inputStream.codec.framerate.den), int(inputStream.sample_aspect_ratio.num), int(inputStream.sample_aspect_ratio.den), int(codecPar.width), int(codecPar.height)); err != nil {
		return err
	}

	frameNumber := 0
	for C.av_read_frame(fmtCtx, packet) >= 0 {
		if packet.stream_index != videoStreamIndex {
			C.av_packet_unref(packet)
			continue
		}

		// Copy the payload into Go memory, then release the packet right
		// away so it is not held while the callback runs.
		data := slices.Clone(unsafe.Slice((*byte)(packet.data), int(packet.size)))
		C.av_packet_unref(packet)

		if err := packetFunc(PacketData{Number: frameNumber, Data: data}); err != nil {
			if errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}
		frameNumber++
	}

	return nil
}

141
libopenjp2/libopenjp2.go Normal file
View file

@ -0,0 +1,141 @@
package libopenjp2
/*
#cgo pkg-config: libopenjp2
#include "libopenjp2.h"
*/
import "C"
import (
"encoding/binary"
"errors"
"fmt"
"runtime"
"slices"
"unsafe"
)
//FFMpeg removed libopenjpeg decoder https://github.com/FFmpeg/FFmpeg/commit/60ccb3fe787be3bb10fc4545b3593cd1e0b769ed

// Jp2SigType and Jp2SigValue are the type and content words that DecodeFrame
// looks for at byte offsets 4 and 8 to recognize JP2-wrapped input.
const Jp2SigType = 0x6A502020
const Jp2SigValue = 0x0D0A870A

// Jpeg2000Decoder holds the OpenJPEG decoding parameters applied to every
// frame decoded through it.
type Jpeg2000Decoder struct {
ctx C.opj_dparameters_t
}

// QualityLayersAll and ResolutionLayersAll are the zero values for the two
// NewJpeg2000Decoder arguments.
// NOTE(review): presumably 0 means "decode all quality layers" and "discard no
// resolution levels" in OpenJPEG - confirm against the opj_dparameters_t docs.
const QualityLayersAll = 0
const ResolutionLayersAll = 0
// NewJpeg2000Decoder returns a decoder initialized with OpenJPEG's default
// decoding parameters, with cp_layer set to qualityLayers and cp_reduce set
// to resolutionLayers. The returned error is always nil.
func NewJpeg2000Decoder(qualityLayers, resolutionLayers uint) (*Jpeg2000Decoder, error) {
	decoder := new(Jpeg2000Decoder)
	params := &decoder.ctx

	C.opj_set_default_decoder_parameters(params)
	params.cp_layer = C.uint32_t(qualityLayers)
	params.cp_reduce = C.uint32_t(resolutionLayers)

	return decoder, nil
}
// jp2c is the four bytes "jp2c" read as a little-endian uint32, for
// comparison against input words read the same way.
var jp2c = binary.LittleEndian.Uint32([]byte("jp2c"))

// Jpeg2000Frame is a decoded image: its dimensions (taken from the first
// component) and one Go-owned sample plane per component.
type Jpeg2000Frame struct {
Width, Height int
X, Y, Z []uint32
}
// DecodeFrame decodes one JPEG 2000 image from buf using OpenJPEG and returns
// its three 12-bit components as Go-owned planes.
//
// buf may be a JP2-wrapped file (recognized by its 12-byte signature box), a
// raw codestream, or a codestream preceded by a "jp2c" box header, which is
// skipped. An error is returned when buf is too short to probe, when OpenJPEG
// fails at any stage, or when the image does not consist of exactly three
// 12-bit components of identical size.
func (d *Jpeg2000Decoder) DecodeFrame(buf []byte) (frame *Jpeg2000Frame, err error) {
	// The container probes below read the first 12 bytes unconditionally.
	if len(buf) < 12 {
		return nil, errors.New("input too short for a JPEG 2000 frame")
	}

	var image *C.opj_image_t
	var dec *C.opj_codec_t
	var stream *C.opj_stream_t

	// buf is read from C through the BufferReader, so keep it pinned for the
	// duration of the call.
	var pinner runtime.Pinner
	pinner.Pin(unsafe.Pointer(unsafe.SliceData(buf)))
	defer pinner.Unpin()

	// Check if input is a raw jpeg2k codestream or in jp2 wrapping.
	// JP2 box fields are stored big-endian: length 12, type, then content.
	if (binary.BigEndian.Uint32(buf) == 12) &&
		(binary.BigEndian.Uint32(buf[4:]) == Jp2SigType) &&
		(binary.BigEndian.Uint32(buf[8:]) == Jp2SigValue) {
		dec = C.opj_create_decompress(C.OPJ_CODEC_JP2)
	} else {
		/* If the AVPacket contains a jp2c box, then skip to
		 * the starting byte of the codestream. */
		if binary.LittleEndian.Uint32(buf[4:]) == jp2c {
			buf = buf[8:]
		}
		dec = C.opj_create_decompress(C.OPJ_CODEC_J2K)
	}

	if dec == nil {
		return nil, errors.New("error initializing decoder")
	}
	defer C.opj_destroy_codec(dec)

	// Tie decoder with decoding parameters
	if C.opj_setup_decoder(dec, &d.ctx) != 1 {
		return nil, errors.New("error setting up decoder")
	}

	stream = C.opj_stream_default_create(C.OPJ_STREAM_READ)
	if stream == nil {
		return nil, errors.New("error initializing stream")
	}
	defer C.opj_stream_destroy(stream)

	// In-memory reader state shared with the C stream callbacks.
	reader := &C.BufferReader{
		pos:    0,
		size:   C.int(len(buf)),
		buffer: (*C.uchar)(unsafe.Pointer(unsafe.SliceData(buf))),
	}
	pinner.Pin(reader)

	C.set_stream_callbacks(stream)
	C.opj_stream_set_user_data(stream, unsafe.Pointer(reader), nil)
	C.opj_stream_set_user_data_length(stream, C.ulong(len(buf)))

	ret := C.opj_read_header(stream, dec, &image)
	// Registered before the check so a partially created image is released too.
	defer C.opj_image_destroy(image)
	if ret != 1 {
		return nil, errors.New("error decoding stream header")
	}

	if image.numcomps != 3 {
		return nil, fmt.Errorf("unexpected component number %d", image.numcomps)
	}

	components := unsafe.Slice(image.comps, int(image.numcomps))

	for i, c := range components {
		if c.prec != 12 {
			return nil, fmt.Errorf("unexpected component %d bit depth %d", i, c.prec)
		}
	}

	ret = C.opj_decode(dec, stream, image)
	if ret != 1 {
		return nil, errors.New("error decoding image")
	}

	for i, c := range components {
		if c.data == nil {
			return nil, fmt.Errorf("component %d has no data", i)
		}
		// Width/Height below are taken from component 0, so every plane
		// must have exactly that size.
		if c.w != components[0].w || c.h != components[0].h {
			return nil, fmt.Errorf("component %d size %dx%d differs from component 0 size %dx%d", i, c.w, c.h, components[0].w, components[0].h)
		}
	}

	// readComponent views a component's C sample buffer as a Go slice; the
	// view is only valid until image is destroyed, hence the clones below.
	readComponent := func(index int) []uint32 {
		return unsafe.Slice((*uint32)(unsafe.Pointer(components[index].data)), int(components[index].h)*int(components[index].w))
	}

	frame = &Jpeg2000Frame{
		Width:  int(components[0].w),
		Height: int(components[0].h),
		X:      slices.Clone(readComponent(0)),
		Y:      slices.Clone(readComponent(1)),
		Z:      slices.Clone(readComponent(2)),
	}

	return frame, nil
}

67
libopenjp2/libopenjp2.h Normal file
View file

@ -0,0 +1,67 @@
#include <string.h>
#include <openjpeg.h>
/* Read cursor over an in-memory byte buffer; handed to the stream callbacks in this header as their user_data. */
typedef struct BufferReader {
int pos; /* current read offset into buffer, in bytes */
int size; /* total number of bytes available in buffer */
const uint8_t *buffer; /* start of the data being read; never written through this struct */
} BufferReader;
/*
 * Read callback for an in-memory BufferReader.
 *
 * Copies at most nb_bytes from the current cursor position into out_buffer,
 * advances the cursor by the number of bytes copied and returns that count.
 * Returns (OPJ_SIZE_T)-1 when the cursor already sits at the end of the
 * buffer.
 */
static OPJ_SIZE_T stream_read(void *out_buffer, OPJ_SIZE_T nb_bytes, void *user_data) {
    BufferReader *src = user_data;
    int available;
    OPJ_SIZE_T count;

    /* Nothing left to hand out: signal end of stream. */
    if (src->pos == src->size) {
        return (OPJ_SIZE_T)-1;
    }

    /* Clamp the request to what is left in the buffer. */
    available = src->size - src->pos;
    count = (nb_bytes > available) ? available : nb_bytes;

    memcpy(out_buffer, src->buffer + src->pos, count);
    src->pos += (int)count;
    return count;
}
/*
 * Skip callback for an in-memory BufferReader.
 *
 * Moves the cursor by nb_bytes (negative values move backwards), clamping the
 * move so the cursor stays within [0, size]. Returns the number of bytes
 * actually skipped, or (OPJ_OFF_T)-1 when no movement is possible because the
 * cursor is already at the start (backward skip) or at the end (forward skip).
 */
static OPJ_OFF_T stream_skip(OPJ_OFF_T nb_bytes, void *user_data)
{
    BufferReader *reader = user_data;

    if (nb_bytes < 0) {
        /* Backward skip: cannot move before the start of the buffer. */
        if (reader->pos == 0) {
            /*
             * The sentinel must be built in the return type. Casting through
             * OPJ_SIZE_T yields 0xFFFFFFFF (not -1) once widened to a 64-bit
             * OPJ_OFF_T on targets where size_t is 32 bits wide.
             */
            return (OPJ_OFF_T)-1;
        }
        if (nb_bytes + reader->pos < 0) {
            nb_bytes = -reader->pos;
        }
    } else {
        int remaining;

        /* Forward skip: cannot move past the end of the buffer. */
        if (reader->pos == reader->size) {
            return (OPJ_OFF_T)-1;
        }
        remaining = reader->size - reader->pos;
        if (nb_bytes > remaining) {
            nb_bytes = remaining;
        }
    }

    reader->pos += (int)nb_bytes;
    return nb_bytes;
}
/*
 * Seek callback for an in-memory BufferReader.
 *
 * Places the cursor at the absolute offset nb_bytes. Offsets outside
 * [0, size] are rejected with OPJ_FALSE and leave the cursor untouched;
 * otherwise the cursor is updated and OPJ_TRUE is returned.
 */
static OPJ_BOOL stream_seek(OPJ_OFF_T nb_bytes, void *user_data)
{
    BufferReader *cursor = user_data;

    if (nb_bytes >= 0 && nb_bytes <= cursor->size) {
        cursor->pos = (int)nb_bytes;
        return OPJ_TRUE;
    }
    return OPJ_FALSE;
}
/*
 * Installs the in-memory seek, skip and read callbacks defined in this
 * header on the given stream. The caller still has to attach a BufferReader
 * as the stream's user data.
 */
static void set_stream_callbacks(opj_stream_t* stream) {
    opj_stream_set_seek_function(stream, stream_seek);
    opj_stream_set_skip_function(stream, stream_skip);
    opj_stream_set_read_function(stream, stream_read);
}

29
parameters.go Normal file
View file

@ -0,0 +1,29 @@
package main
import (
"gonum.org/v1/gonum/mat"
"math"
)
// LumaToFull16 scales l by math.MaxUint16, rounds to the nearest integer and
// clamps the result to the uint16 range [0, 65535].
func LumaToFull16(l float64) uint16 {
	scaled := int(math.Round(l * math.MaxUint16))
	switch {
	case scaled < 0:
		return 0
	case scaled > math.MaxUint16:
		return math.MaxUint16
	}
	return uint16(scaled)
}
// ChromaToFull16 scales c by math.MaxUint16, offsets it by math.MaxInt16+1 so
// that zero lands on 32768, rounds to the nearest integer and clamps the
// result to the uint16 range [0, 65535].
func ChromaToFull16(c float64) uint16 {
	shifted := int(math.Round(c*math.MaxUint16 + math.MaxInt16 + 1))
	if shifted < 0 {
		return 0
	}
	if shifted > math.MaxUint16 {
		return math.MaxUint16
	}
	return uint16(shifted)
}
// RoundMatToPrecision returns a new matrix holding the entries of m rounded
// to the given number of decimal places.
//
// A decimals value of zero or less means "no rounding": the result is a plain
// copy of m. m itself is never modified.
func RoundMatToPrecision(m *mat.Dense, decimals int) *mat.Dense {
	var o mat.Dense
	if decimals <= 0 {
		o.CloneFrom(m)
		return &o
	}
	// The scale has to be 10^decimals. The previous 10*decimals only matched
	// for decimals == 1 and rounded to the wrong grid for every other value.
	factor := math.Pow10(decimals)
	o.Apply(func(_, _ int, v float64) float64 {
		scaled := v * factor
		// When the scaled value is not finite (v itself is NaN/Inf, or the
		// requested precision overflows float64) rounding is meaningless, so
		// keep the entry as it is instead of producing NaN.
		if math.IsInf(scaled, 0) || math.IsNaN(scaled) {
			return v
		}
		return math.Round(scaled) / factor
	}, m)
	return &o
}

5
scripts/README.md Normal file
View file

@ -0,0 +1,5 @@
# Script collection for playback
```shell
$ [script].sh "input.mkv"
```

14
scripts/playback-mpv.sh Executable file
View file

@ -0,0 +1,14 @@
#!/bin/bash
# Plays back an XYZ input file straight via mpv.
#
# Usage: playback-mpv.sh INPUT [COLORSPACE]
#   INPUT       file handed to xyz2yuv (-in) and to mpv (--external-file)
#   COLORSPACE  value for xyz2yuv -colorspace; defaults to rec709_pure
#
# xyz2yuv writes the converted video to stdout and mpv reads it from stdin.

# Locate the binary relative to this script without changing the working
# directory, so a relative INPUT still resolves against the caller's directory.
script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

"${script_dir}/../bin/xyz2yuv" \
    -in "${1}" \
    -colorspace "${2:-rec709_pure}" \
    -out - | mpv \
    --demuxer-max-bytes=4096MiB --cache=yes --cache-secs=30 \
    --force-seekable=yes - \
    --external-file="${1}" \
    --vid=1 --aid=1

7
scripts/playback-rec709-mpv.sh Executable file
View file

@ -0,0 +1,7 @@
#!/bin/bash
# Plays back an XYZ input file straight via mpv, using the rec709_pure colorspace.
#
# Usage: [script].sh INPUT

# Resolve the sibling script by absolute path instead of changing directory;
# this script therefore leaves the caller's working directory untouched.
script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

"${script_dir}/playback-mpv.sh" "${1}" rec709_pure

View file

@ -0,0 +1,7 @@
#!/bin/bash
# Plays back an XYZ input file straight via mpv, using the rec709_pure22 colorspace.
#
# Usage: [script].sh INPUT

# Resolve the sibling script by absolute path instead of changing directory;
# this script therefore leaves the caller's working directory untouched.
script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

"${script_dir}/playback-mpv.sh" "${1}" rec709_pure22

View file

@ -0,0 +1,7 @@
#!/bin/bash
# Plays back an XYZ input file straight via mpv, using the rec709_pure24 colorspace.
#
# Usage: [script].sh INPUT

# Resolve the sibling script by absolute path instead of changing directory;
# this script therefore leaves the caller's working directory untouched.
script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

"${script_dir}/playback-mpv.sh" "${1}" rec709_pure24

427
xyz2yuv.go Normal file
View file

@ -0,0 +1,427 @@
package main
import "C"
import (
"crypto/sha256"
"encoding/hex"
"flag"
"fmt"
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/colorspace"
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/conv"
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/libav"
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/libopenjp2"
"gonum.org/v1/gonum/mat"
"io"
"math"
"os"
"runtime"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
"unsafe"
)
// Package-level conversion parameters. All of them are populated by main
// after flag parsing.
var (
	// space is the target system selected through the -colorspace flag.
	space colorspace.RelativeSystem

	// xyz2rgb is the XYZ -> RGB matrix of the selected chromaticity.
	xyz2rgb *mat.Dense

	// xyz2rgbDenorm is xyz2rgb combined with the inverse DCI normalization
	// factor.
	xyz2rgbDenorm *mat.Dense

	// rgb2yuv is the RGB -> YUV matrix of the selected system.
	rgb2yuv *mat.Dense

	// rgb2yuvPremultiplied is rgb2yuv scaled by math.MaxUint16.
	rgb2yuvPremultiplied *mat.Dense

	// rgbGamma is the gamma value paired with the selected system.
	rgbGamma float64
)
// ToPacked interleaves three planes of equal length into a single slice laid
// out as a[0], b[0], c[0], a[1], b[1], c[1], ...
//
// The result has length 3*len(a) and capacity 3*len(a)+extra, so callers can
// reserve spare room behind the data. It panics when the planes differ in
// length.
func ToPacked(a, b, c []uint32, extra int) []uint32 {
	n := len(a)
	if len(b) != n || len(c) != n {
		panic("lengths mismatch")
	}

	packed := make([]uint32, n*3, n*3+extra)
	for src, dst := 0, 0; src < n; src, dst = src+1, dst+3 {
		packed[dst], packed[dst+1], packed[dst+2] = a[src], b[src], c[src]
	}
	return packed
}
// main is the xyz2yuv command-line entry point.
//
// It parses the flags, selects the target colorspace, derives the XYZ -> RGB
// and RGB -> YUV matrices (optionally rounded), opens the output and then runs
// a staged pipeline connected by channels:
//
//  1. libav.OpenXYZ12 hands every packet to jpegDecoderChannel, throttled by
//     tokens taken from availableDecoders.
//  2. numDecoderCpu*2 goroutines decode packets and emit packed frames on
//     outDecoderJobs.
//  3. A reorder goroutine restores frame order, attaches each frame to a
//     pooled frameJobData taken from availableFrames and queues it on
//     inFrameJobs.
//  4. numPipelineCpu workers call Process on each job and forward it to
//     outFrameJobs.
//  5. A writer goroutine restores frame order again, writes each frame and
//     returns the job to availableFrames.
//
// Every unrecoverable error panics.
func main() {
	inFile := flag.String("in", "", "Input file")
	startFrame := flag.Uint64("start", 0, "Start frame number inclusive")
	endFrame := flag.Uint64("end", math.MaxUint64, "End frame number exclusive")
	outFile := flag.String("out", "-", "Output file. Use - for stdout")
	colorspaceRelativeSystem := flag.String("colorspace", "rec709_pure", "Colorspace and parameters to convert into. Supported: rec709, rec709_pure, rec709_pure22, rec709_pure24, rec2020, rec2020_pure, rec2020_pure24")
	xyzPrecision := flag.Int("precision-xyz2rgb", 0, "XYZ -> RGB conversion matrix precision. 0 = maximum")
	rgbPrecision := flag.Int("precision-rgb2yuv", 0, "RGB -> YUV conversion matrix precision. 0 = maximum")
	lowres := flag.Uint("lowres", 0, "Feed lowres parameter. Default is full frame")
	useFloatPipeline := flag.Bool("float", false, "Use float pipeline instead of double, although less precise. Very fast.")
	useGoPipeline := flag.Bool("use-go-pipeline", false, "Use Go pipeline, although slower. Does not support float mode.")
	hashOutput := flag.Bool("hash", false, "Hash with SHA256 each output frame for accuracy comparisons")
	decoderThreads := flag.Uint("decoder-threads", 0, "Threads for JPEG2000 decoding. Defaults to number of logical CPU")
	pipelineThreads := flag.Uint("pipeline-threads", 0, "Threads for colorspace conversion pipeline. Defaults to number of logical CPU")
	flag.Parse()

	runtime.KeepAlive(endFrame)

	//C.av_log_set_level(C.AV_LOG_DEBUG)

	// A thread count of zero means "one per logical CPU".
	numDecoderCpu := int(*decoderThreads)
	if numDecoderCpu == 0 {
		numDecoderCpu = runtime.NumCPU()
	}
	numPipelineCpu := int(*pipelineThreads)
	if numPipelineCpu == 0 {
		numPipelineCpu = runtime.NumCPU()
	}

	useCConverter = !*useGoPipeline
	useFloat = *useFloatPipeline

	switch strings.ToLower(*colorspaceRelativeSystem) {
	case "rec709":
		space = colorspace.SystemRec709
		rgbGamma = colorspace.GammaRec709
	case "rec709_pure":
		space = colorspace.SystemRec709_Pure
		rgbGamma = colorspace.GammaRec709
	case "rec709_pure22":
		space = colorspace.SystemRec709_Pure22
		rgbGamma = colorspace.Gamma22
	case "rec709_pure24":
		space = colorspace.SystemRec709_Pure24
		rgbGamma = colorspace.Gamma24
	case "rec2020":
		space = colorspace.SystemRec2020
		rgbGamma = colorspace.GammaRec2020
	case "rec2020_pure":
		space = colorspace.SystemRec2020_Pure
		rgbGamma = colorspace.GammaRec2020
	case "rec2020_pure24":
		space = colorspace.SystemRec2020_Pure24
		rgbGamma = colorspace.Gamma24
	default:
		panic("unsupported colorspace")
	}

	_, xyz2rgb = space.Chromaticity.ConversionXYZ()
	_, rgb2yuv = space.YCbCr.ConversionRGB()

	//adjust xyz2rgb with normalization factor from DCI
	denorm := mat.NewDiagDense(3, []float64{
		1 / colorspace.DCINormalizationFactor,
		1 / colorspace.DCINormalizationFactor,
		1 / colorspace.DCINormalizationFactor,
	})
	xyz2rgbDenorm = mat.NewDense(3, 3, nil)
	xyz2rgbDenorm.Mul(denorm, xyz2rgb)

	// Fold the 16-bit full-range scale into the RGB -> YUV matrix.
	premult := mat.NewDiagDense(3, []float64{
		math.MaxUint16,
		math.MaxUint16,
		math.MaxUint16,
	})
	rgb2yuvPremultiplied = mat.NewDense(3, 3, nil)
	rgb2yuvPremultiplied.Mul(rgb2yuv, premult)

	xyz2rgb = RoundMatToPrecision(xyz2rgb, *xyzPrecision)
	rgb2yuv = RoundMatToPrecision(rgb2yuv, *rgbPrecision)
	xyz2rgbDenorm = RoundMatToPrecision(xyz2rgbDenorm, *xyzPrecision)
	rgb2yuvPremultiplied = RoundMatToPrecision(rgb2yuvPremultiplied, *rgbPrecision)

	_, _ = fmt.Fprintf(os.Stderr, "\nXYZ to RGB matrix:\n%v\n\n", mat.Formatted(xyz2rgb))
	_, _ = fmt.Fprintf(os.Stderr, "\nXYZ to RGB matrix (denormalized):\n%v\n\n", mat.Formatted(xyz2rgbDenorm))
	_, _ = fmt.Fprintf(os.Stderr, "\nRGB to YUV matrix:\n%v\n\n", mat.Formatted(rgb2yuv))
	_, _ = fmt.Fprintf(os.Stderr, "\nRGB to YUV matrix (premultiplied):\n%v\n\n", mat.Formatted(rgb2yuvPremultiplied))

	if useCConverter {
		_, _ = fmt.Fprintf(os.Stderr, "\nDecoder: CGO %s\n", conv.DecoderInformation())
	} else {
		_, _ = fmt.Fprintf(os.Stderr, "\nDecoder: Go Generic scalar pipeline (1d 1f)\n")
	}
	if useFloat && useCConverter {
		_, _ = fmt.Fprintf(os.Stderr, "Data type: float32\n\n")
	} else {
		_, _ = fmt.Fprintf(os.Stderr, "Data type: float64\n\n")
	}

	if useCConverter {
		conv.InitData(xyz2rgbDenorm, rgb2yuvPremultiplied, colorspace.GammaDCIXYZ, rgbGamma)
	}

	//open and write output file header
	var output *os.File
	if *outFile == "-" {
		output = os.Stdout
	} else {
		f, err := os.Create(*outFile)
		if err != nil {
			panic(err)
		}
		output = f
	}
	defer output.Close()

	// outputFrame writes one frame as a "FRAME\n" marker followed by the raw
	// bytes of the y, cb and cr planes, optionally printing a SHA256 of the
	// three planes to stderr first.
	outputFrame := func(number int, y, cb, cr []uint16) error {
		// Reinterpret the uint16 planes as bytes without copying.
		by := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(y))), len(y)*2)
		bcb := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(cb))), len(cb)*2)
		bcr := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(cr))), len(cr)*2)

		if *hashOutput {
			hasher := sha256.New()
			hasher.Write(by)
			hasher.Write(bcb)
			hasher.Write(bcr)
			fmt.Fprintf(os.Stderr, "\r%s\n", hex.EncodeToString(hasher.Sum(nil)))
			//fmt.Fprintf(os.Stderr, "\rFrame %d: %s\n", number, hex.EncodeToString(hasher.Sum(nil)))
		}

		_, err := output.WriteString("FRAME\n")
		if err != nil {
			return err
		}
		_, err = output.Write(by)
		if err != nil {
			return err
		}
		_, err = output.Write(bcb)
		if err != nil {
			return err
		}
		_, err = output.Write(bcr)
		if err != nil {
			return err
		}
		return nil
	}

	// decode and processing loop
	var wg sync.WaitGroup

	availableFrames := make(chan *frameJobData, numPipelineCpu*2)
	inFrameJobs := make(chan *frameJobData, numPipelineCpu)
	outFrameJobs := make(chan *frameJobData, numPipelineCpu)
	availableDecoders := make(chan struct{}, numDecoderCpu*2)
	outDecoderJobs := make(chan *decodedFrame, numPipelineCpu)
	jpegDecoderChannel := make(chan libav.PacketData, numDecoderCpu)

	// Next frame number expected by the writer and by the reorder stage.
	var expectedFrame = *startFrame
	var expectedFrameDecoder = expectedFrame

	// processedFrames counts converted frames; it is not a frame number.
	var processedFrames atomic.Uint64
	var firstFrameTime time.Time

	// Conversion workers.
	var wg2 sync.WaitGroup
	for i := 0; i < numPipelineCpu; i++ {
		wg2.Add(1)
		go func() {
			defer wg2.Done()
			for job := range inFrameJobs {
				job.Process()
				processedFrames.Add(1)
				outFrameJobs <- job
			}
		}()
	}

	// Close outFrameJobs once every conversion worker has finished.
	wg.Add(1)
	go func() {
		defer wg.Done()
		wg2.Wait()
		close(outFrameJobs)
	}()

	// Writer: workers finish out of order, so buffer converted frames and emit
	// them strictly by frame number.
	wg.Add(1)
	go func() {
		defer wg.Done()
		outputs := make([]*frameJobData, 0)
		for out := range outFrameJobs {
			outputs = append(outputs, out)
			slices.SortFunc(outputs, func(a, b *frameJobData) int {
				return a.frame - b.frame
			})
			for len(outputs) > 0 {
				f := outputs[0]
				if f.frame != int(expectedFrame) {
					break
				}
				//output frame to file
				err := outputFrame(f.frame, f.y, f.cb, f.cr)
				if err != nil {
					panic(err)
				}
				outputs = slices.Delete(outputs, 0, 1)
				expectedFrame++
				// Hand the buffers back to the pool for reuse.
				availableFrames <- f
			}
		}
	}()

	decoder, err := libopenjp2.NewJpeg2000Decoder(libopenjp2.QualityLayersAll, *lowres)
	if err != nil {
		panic(err)
	}

	// Stream properties, filled in by the OpenXYZ12 header callback below and
	// read when the output header is written.
	var streamFramerateNum, streamFramerateDen, streamSarNum, streamSarDen int
	var onceInit sync.Once

	// JPEG2000 decoder goroutines.
	var wgDecoder sync.WaitGroup
	for i := 0; i < numDecoderCpu*2; i++ {
		wg.Add(1)
		wgDecoder.Add(1)
		go func() {
			defer wg.Done()
			// Must be Done, not Add: otherwise wgDecoder.Wait() below never
			// returns and the program hangs after the last frame.
			defer wgDecoder.Done()
			for p := range jpegDecoderChannel {
				frame, err := decoder.DecodeFrame(p.Data)
				if err != nil {
					panic(err)
				}

				// The first decoded frame supplies the frame size: write the
				// stream header and fill the pool of reusable jobs.
				onceInit.Do(func() {
					_, err := fmt.Fprintf(output, "YUV4MPEG2 W%d H%d F%d:%d I%s A%d:%d%s%s\n",
						frame.Width,
						frame.Height,
						streamFramerateNum,
						streamFramerateDen,
						"p",
						streamSarNum,
						streamSarDen,
						" C444p16 XYSCSS=444P16",
						" XCOLORRANGE=FULL",
					)
					if err != nil {
						panic(err)
					}

					yuvLineSize := frame.Width
					yuvFrameSize := frame.Height * yuvLineSize
					for i := 0; i < numPipelineCpu*2; i++ {
						availableFrames <- &frameJobData{
							wg:     &wg,
							frame:  0,
							width:  frame.Width,
							height: frame.Height,
							in:     nil,
							//add extra capacity for OOB writes in ASM code
							y:  make([]uint16, yuvFrameSize, yuvFrameSize+64),
							cb: make([]uint16, yuvFrameSize, yuvFrameSize+64),
							cr: make([]uint16, yuvFrameSize, yuvFrameSize+64),
						}
					}
					firstFrameTime = time.Now().UTC()
				})

				outDecoderJobs <- &decodedFrame{
					Number: p.Number,
					//add extra capacity for OOB reads in ASM code
					Frame: ToPacked(frame.X, frame.Y, frame.Z, 256),
				}
			}
		}()
	}

	// Reorder stage: decoders finish out of order, so buffer decoded frames
	// and feed them to the conversion workers strictly by frame number.
	wg.Add(1)
	go func() {
		defer wg.Done()
		defer close(inFrameJobs)
		outputs := make([]*decodedFrame, 0)
		for out := range outDecoderJobs {
			outputs = append(outputs, out)
			slices.SortFunc(outputs, func(a, b *decodedFrame) int {
				return a.Number - b.Number
			})
			for len(outputs) > 0 {
				frame := outputs[0]
				if frame.Number != int(expectedFrameDecoder) {
					break
				}
				f := <-availableFrames
				f.frame = frame.Number
				//f.inLineSize = linesize
				f.in = frame.Frame
				// NOTE(review): presumably balanced by the job itself, which
				// carries &wg; confirm against frameJobData.
				wg.Add(1)
				inFrameJobs <- f
				outputs = slices.Delete(outputs, 0, 1)
				expectedFrameDecoder++
				// Let the reader submit another packet.
				availableDecoders <- struct{}{}
			}
		}
	}()

	// Progress display, refreshed once per second on stderr.
	go func() {
		for range time.Tick(time.Second) {
			frames := processedFrames.Load()
			runningTime := time.Now().UTC().Sub(firstFrameTime)
			// processedFrames is already a count of converted frames, so it
			// must not be offset by the start frame number.
			fps := float64(frames) / runningTime.Seconds()
			_, _ = fmt.Fprintf(os.Stderr, "\rFrames %d %.02f fps %s ", frames, fps, runningTime.Truncate(time.Second))
		}
	}()

	err = libav.OpenXYZ12(*inFile, func(framerateNum, framerateDen, sarNum, sarDen, width, height int) error {
		streamFramerateNum = framerateNum
		streamFramerateDen = framerateDen
		streamSarNum = sarNum
		streamSarDen = sarDen
		// Seed the tokens that bound how many packets may be in flight.
		for i := 0; i < numDecoderCpu*2; i++ {
			availableDecoders <- struct{}{}
		}
		return nil
	}, func(p libav.PacketData) error {
		if uint64(p.Number) < *startFrame {
			// Still skipping: keep pushing the timing origin forward.
			firstFrameTime = time.Now().UTC()
			return nil
		}
		if uint64(p.Number) >= *endFrame {
			return io.EOF
		}
		<-availableDecoders
		jpegDecoderChannel <- p
		return nil
	})
	if err != nil {
		panic(err)
	}

	// Shut the stages down in order: decoders, reorder stage, workers, writer.
	close(jpegDecoderChannel)
	wgDecoder.Wait()
	close(outDecoderJobs)
	wg.Wait()

	print("\n\n")
	runningTime := time.Now().UTC().Sub(firstFrameTime)
	total := processedFrames.Load()
	fps := float64(total) / runningTime.Seconds()
	_, _ = fmt.Fprintf(os.Stderr, "\nTotal %d frames, %.02f fps, took %s \n", total, fps, runningTime.Truncate(time.Millisecond))
}