Initial commit
This commit is contained in:
commit
6eda53859e
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
/bin/*
|
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
[submodule "conv/simd_utils"]
|
||||
path = conv/simd_utils
|
||||
url = https://github.com/JishinMaster/simd_utils.git
|
19
LICENSE
Normal file
19
LICENSE
Normal file
|
@ -0,0 +1,19 @@
|
|||
Copyright (c) 2024 WeebDataHoarder, xyz2yuv Contributors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
11
README.md
Normal file
11
README.md
Normal file
|
@ -0,0 +1,11 @@
|
|||
# xyz2yuv tool
|
||||
Decode DCI XYZ' and apply conversions to your desired output colorspace. Supports Rec. 709 and Rec. 2020, with adjustable precision and gamma values.
|
||||
|
||||
Supports AVX-512 and AVX2 targets, and a generic implementation in C and Go as well.
|
||||
|
||||
## Dependencies
|
||||
* CGO
|
||||
* libopenjp2-dev
|
||||
* libavformat-dev
|
||||
* libavcodec-dev
|
||||
* libavutil-dev
|
0
bin/.gitkeep
Normal file
0
bin/.gitkeep
Normal file
14
build.sh
Executable file
14
build.sh
Executable file
|
@ -0,0 +1,14 @@
|
|||
#!/bin/bash
#
# Builds the xyz2yuv binary and its 2/4/8-pump SIMD variants into bin/.

# Stop at the first failing build so the exit status reflects the failure.
set -e

# Always build from the directory that contains this script.
cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

# Prefer the toolchain under GOROOT when it is set.
CMD=go
if [[ -n "${GOROOT}" ]]; then
    CMD="${GOROOT}/bin/go"
fi

# build OUTPUT [NAME=VALUE...]
# Compiles the module to OUTPUT; extra NAME=VALUE pairs are added to the
# environment of the go invocation (used to pass CGO_CFLAGS).
build() {
    local output=$1
    shift
    env "$@" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 "${CMD}" build -v -buildvcs=false -trimpath -gcflags=all="-l" -ldflags="-s -w -buildid=" -o "${output}" ./
}

build bin/xyz2yuv
for pumps in 2 4 8; do
    build "bin/xyz2yuv_${pumps}pump" "CGO_CFLAGS=-DSIMD_PUMPS=${pumps}"
done
|
53
colorspace/adaptation.go
Normal file
53
colorspace/adaptation.go
Normal file
|
@ -0,0 +1,53 @@
|
|||
package colorspace
|
||||
|
||||
import "gonum.org/v1/gonum/mat"
|
||||
|
||||
type ChromaticAdaptation mat.Dense
|
||||
|
||||
func (a ChromaticAdaptation) AdaptXYZ(from, to Illuminant) mat.Matrix {
|
||||
var fromM, toM *mat.VecDense
|
||||
|
||||
{
|
||||
Xw, Yw, Zw := from.ToXYZ()
|
||||
fromM = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
|
||||
}
|
||||
|
||||
{
|
||||
Xw, Yw, Zw := to.ToXYZ()
|
||||
toM = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
|
||||
}
|
||||
|
||||
var crdFrom, crdTo mat.VecDense
|
||||
|
||||
crdFrom.MulVec((*mat.Dense)(&a), fromM)
|
||||
crdTo.MulVec((*mat.Dense)(&a), toM)
|
||||
|
||||
return mat.NewDiagDense(3, []float64{
|
||||
crdTo.AtVec(0) / crdFrom.AtVec(0),
|
||||
crdTo.AtVec(1) / crdFrom.AtVec(1),
|
||||
crdTo.AtVec(2) / crdFrom.AtVec(2),
|
||||
})
|
||||
}
|
||||
|
||||
var (
|
||||
ChromaticAdaptationBradford = mat.NewDense(3, 3, []float64{
|
||||
0.8951, 0.2664, -0.1614,
|
||||
-0.7502, 1.7135, 0.0367,
|
||||
0.0389, -0.0685, 1.0296,
|
||||
})
|
||||
ChromaticAdaptationCMCCAT2000 = mat.NewDense(3, 3, []float64{
|
||||
0.7982, 0.3389, -0.1371,
|
||||
-0.5918, 1.5512, 0.0406,
|
||||
0.0008, 0.0239, 0.9753,
|
||||
})
|
||||
ChromaticAdaptationCIECAT02 = mat.NewDense(3, 3, []float64{
|
||||
0.7328, 0.4296, -0.1624,
|
||||
-0.7036, 1.6975, 0.0061,
|
||||
0.0030, 0.0136, 0.9834,
|
||||
})
|
||||
ChromaticAdaptationSharp = mat.NewDense(3, 3, []float64{
|
||||
1.2694, -0.0988, -0.1706,
|
||||
-0.8364, 1.8006, 0.0357,
|
||||
0.0297, -0.0315, 1.0018,
|
||||
})
|
||||
)
|
145
colorspace/chromaticity.go
Normal file
145
colorspace/chromaticity.go
Normal file
|
@ -0,0 +1,145 @@
|
|||
package colorspace
|
||||
|
||||
import "gonum.org/v1/gonum/mat"
|
||||
|
||||
type Chromaticity struct {
|
||||
Red ColorCoordinate
|
||||
Green ColorCoordinate
|
||||
Blue ColorCoordinate
|
||||
White Illuminant
|
||||
}
|
||||
|
||||
func (c Chromaticity) ConversionXYZ() (to, from *mat.Dense) {
|
||||
var err error
|
||||
var RGB *mat.Dense
|
||||
var W *mat.VecDense
|
||||
|
||||
{
|
||||
Xr, Yr, Zr := c.Red.ToXYZ()
|
||||
Xg, Yg, Zg := c.Green.ToXYZ()
|
||||
Xb, Yb, Zb := c.Blue.ToXYZ()
|
||||
|
||||
RGB = mat.NewDense(3, 3, []float64{
|
||||
Xr, Xg, Xb,
|
||||
Yr, Yg, Yb,
|
||||
Zr, Zg, Zb,
|
||||
})
|
||||
}
|
||||
|
||||
{
|
||||
Xw, Yw, Zw := c.White.ToXYZ()
|
||||
|
||||
W = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
|
||||
}
|
||||
|
||||
var tmp1, rgb2xyz, xyz2rgb mat.Dense
|
||||
|
||||
var S mat.VecDense
|
||||
|
||||
if err = tmp1.Inverse(RGB); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
S.MulVec(&tmp1, W)
|
||||
|
||||
M := mat.NewDense(3, 3, []float64{
|
||||
S.AtVec(0) * RGB.At(0, 0), S.AtVec(1) * RGB.At(0, 1), S.AtVec(2) * RGB.At(0, 2),
|
||||
S.AtVec(0) * RGB.At(1, 0), S.AtVec(1) * RGB.At(1, 1), S.AtVec(2) * RGB.At(1, 2),
|
||||
S.AtVec(0) * RGB.At(2, 0), S.AtVec(1) * RGB.At(2, 1), S.AtVec(2) * RGB.At(2, 2),
|
||||
})
|
||||
|
||||
rgb2xyz.CloneFrom(M)
|
||||
|
||||
if err = xyz2rgb.Inverse(M); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
return &rgb2xyz, &xyz2rgb
|
||||
}
|
||||
|
||||
/*
|
||||
func (c Chromaticity) XYZToRGB(connectionSpaceWhite Illuminant, adaptation ChromaticAdaptation) {
|
||||
var err error
|
||||
var RGB *mat.Dense
|
||||
var W1, W2 *mat.VecDense
|
||||
|
||||
{
|
||||
Xr, Yr, Zr := c.Red.ToXYZ()
|
||||
Xg, Yg, Zg := c.Green.ToXYZ()
|
||||
Xb, Yb, Zb := c.Blue.ToXYZ()
|
||||
|
||||
RGB = mat.NewDense(3, 3, []float64{
|
||||
Xr, Xg, Xb,
|
||||
Yr, Yg, Yb,
|
||||
Zr, Zg, Zb,
|
||||
})
|
||||
}
|
||||
|
||||
{
|
||||
Xw, Yw, Zw := c.White.ToXYZ()
|
||||
|
||||
W1 = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
|
||||
}
|
||||
|
||||
{
|
||||
Xw, Yw, Zw := connectionSpaceWhite.ToXYZ()
|
||||
|
||||
W2 = mat.NewVecDense(3, []float64{Xw, Yw, Zw})
|
||||
}
|
||||
|
||||
var tmp1, tmp2, M, Mc, M2, rgb2xyz, xyz2rgb, source_destination_whites, destination_source_whites, adapted_rgb2xyz_2, adapted_xyz2rgb_2 mat.Dense
|
||||
|
||||
var S, crdS, crdD, RA mat.VecDense
|
||||
|
||||
if err = tmp1.Inverse(RGB); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
S.MulVec(&tmp1, W1)
|
||||
|
||||
//TODO
|
||||
M.Mul(&S, RGB)
|
||||
|
||||
rgb2xyz.CloneFrom(&M)
|
||||
|
||||
if err = xyz2rgb.Inverse(&M); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
|
||||
// chromatic adaptation
|
||||
|
||||
crdS.MulVec((*mat.Dense)(&adaptation), W1)
|
||||
crdD.MulVec((*mat.Dense)(&adaptation), W2)
|
||||
|
||||
Mt := mat.NewDiagDense(3, []float64{
|
||||
crdD.AtVec(0) / crdS.AtVec(0),
|
||||
crdD.AtVec(1) / crdS.AtVec(1),
|
||||
crdD.AtVec(2) / crdS.AtVec(2),
|
||||
})
|
||||
|
||||
if err = tmp1.Inverse((*mat.Dense)(&adaptation)); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
tmp2.Mul(&tmp1, Mt)
|
||||
Mc.Mul(&tmp2, (*mat.Dense)(&adaptation))
|
||||
|
||||
source_destination_whites.CloneFrom(&Mc)
|
||||
|
||||
if err = destination_source_whites.Inverse(&Mc); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
M2.Mul(&Mc, &M)
|
||||
|
||||
adapted_rgb2xyz_2.CloneFrom(&M2)
|
||||
|
||||
if err = adapted_xyz2rgb_2.Inverse(&M2); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
RA.MulVec(&Mc, W1)
|
||||
|
||||
}
|
||||
*/
|
18
colorspace/color.go
Normal file
18
colorspace/color.go
Normal file
|
@ -0,0 +1,18 @@
|
|||
package colorspace
|
||||
|
||||
// ColorCoordinate is a pair of chromaticity coordinates stored as {x, y}.
type ColorCoordinate [2]float64

// X returns the first coordinate.
func (c ColorCoordinate) X() float64 {
	x := c[0]
	return x
}

// Y returns the second coordinate.
func (c ColorCoordinate) Y() float64 {
	y := c[1]
	return y
}

// ToXYZ converts the xy pair to XYZ with Y fixed at 1:
// X = x/y, Y = 1, Z = (1-x-y)/y.
func (c ColorCoordinate) ToXYZ() (X, Y, Z float64) {
	x, y := c[0], c[1]
	return x / y, 1.0, (1 - x - y) / y
}
|
18
colorspace/illuminant.go
Normal file
18
colorspace/illuminant.go
Normal file
|
@ -0,0 +1,18 @@
|
|||
package colorspace
|
||||
|
||||
type Illuminant = ColorCoordinate
|
||||
|
||||
// Standard Illuminants in 2 degree form
|
||||
var (
|
||||
IlluminantD50 = Illuminant{0.34567, 0.35850}
|
||||
IlluminantD55 = Illuminant{0.33242, 0.34743}
|
||||
|
||||
// IlluminantD60 P3-D60 (ACES Cinema)
|
||||
IlluminantD60 = Illuminant{0.32168, 0.33767}
|
||||
|
||||
// IlluminantD63 P3-DCI (Theater)
|
||||
IlluminantD63 = Illuminant{0.314, 0.351}
|
||||
|
||||
// IlluminantD65 Standard D65 for Rec. 709, Rec. 2020, sRGB and many more
|
||||
IlluminantD65 = Illuminant{0.31271, 0.32902}
|
||||
)
|
79
colorspace/relative.go
Normal file
79
colorspace/relative.go
Normal file
|
@ -0,0 +1,79 @@
|
|||
package colorspace
|
||||
|
||||
type RelativeSystem struct {
|
||||
Chromaticity Chromaticity
|
||||
|
||||
fromLinearTransfer TransferFunction
|
||||
|
||||
YCbCr YCbCrConverter
|
||||
}
|
||||
|
||||
func (s RelativeSystem) FromLinear(c float64) float64 {
|
||||
return s.fromLinearTransfer(c)
|
||||
}
|
||||
|
||||
func NewRelativeSystem(chromaticity Chromaticity, fromLinearTransfer TransferFunction, converter YCbCrConverter) RelativeSystem {
|
||||
return RelativeSystem{
|
||||
Chromaticity: chromaticity,
|
||||
fromLinearTransfer: fromLinearTransfer,
|
||||
YCbCr: converter,
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
SystemSRGB = NewRelativeSystem(Chromaticity{
|
||||
Red: ColorCoordinate{0.640, 0.330},
|
||||
Green: ColorCoordinate{0.300, 0.600},
|
||||
Blue: ColorCoordinate{0.150, 0.060},
|
||||
White: IlluminantD65,
|
||||
}, CompandingSRGB, YCbCr_Rec709)
|
||||
|
||||
SystemRec709 = NewRelativeSystem(Chromaticity{
|
||||
Red: ColorCoordinate{0.640, 0.330},
|
||||
Green: ColorCoordinate{0.300, 0.600},
|
||||
Blue: ColorCoordinate{0.150, 0.060},
|
||||
White: IlluminantD65,
|
||||
}, CompandingRec709, YCbCr_Rec709)
|
||||
|
||||
SystemRec709_Pure = NewRelativeSystem(Chromaticity{
|
||||
Red: ColorCoordinate{0.640, 0.330},
|
||||
Green: ColorCoordinate{0.300, 0.600},
|
||||
Blue: ColorCoordinate{0.150, 0.060},
|
||||
White: IlluminantD65,
|
||||
}, PureRec709, YCbCr_Rec709)
|
||||
|
||||
SystemRec709_Pure22 = NewRelativeSystem(Chromaticity{
|
||||
Red: ColorCoordinate{0.640, 0.330},
|
||||
Green: ColorCoordinate{0.300, 0.600},
|
||||
Blue: ColorCoordinate{0.150, 0.060},
|
||||
White: IlluminantD65,
|
||||
}, PureRec709_22, YCbCr_Rec709)
|
||||
|
||||
SystemRec709_Pure24 = NewRelativeSystem(Chromaticity{
|
||||
Red: ColorCoordinate{0.640, 0.330},
|
||||
Green: ColorCoordinate{0.300, 0.600},
|
||||
Blue: ColorCoordinate{0.150, 0.060},
|
||||
White: IlluminantD65,
|
||||
}, PureRec709_24, YCbCr_Rec709)
|
||||
|
||||
SystemRec2020 = NewRelativeSystem(Chromaticity{
|
||||
Red: ColorCoordinate{0.708, 0.292},
|
||||
Green: ColorCoordinate{0.170, 0.797},
|
||||
Blue: ColorCoordinate{0.131, 0.046},
|
||||
White: IlluminantD65,
|
||||
}, CompandingRec2020, YCbCr_Rec2020)
|
||||
|
||||
SystemRec2020_Pure = NewRelativeSystem(Chromaticity{
|
||||
Red: ColorCoordinate{0.708, 0.292},
|
||||
Green: ColorCoordinate{0.170, 0.797},
|
||||
Blue: ColorCoordinate{0.131, 0.046},
|
||||
White: IlluminantD65,
|
||||
}, PureRec2020, YCbCr_Rec2020)
|
||||
|
||||
SystemRec2020_Pure24 = NewRelativeSystem(Chromaticity{
|
||||
Red: ColorCoordinate{0.708, 0.292},
|
||||
Green: ColorCoordinate{0.170, 0.797},
|
||||
Blue: ColorCoordinate{0.131, 0.046},
|
||||
White: IlluminantD65,
|
||||
}, PureRec2020, YCbCr_Rec2020)
|
||||
)
|
66
colorspace/types.go
Normal file
66
colorspace/types.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
package colorspace
|
||||
|
||||
import "math"
|
||||
|
||||
// TransferFunction maps one component value to another (for example linear
// light to an encoded signal).
type TransferFunction func(e float64) float64

// DCINormalizationFactor is the divisor applied by TransferFromDCIXYZ.
const DCINormalizationFactor = 48 / 52.37

const (
	Gamma22      = 2.2
	Gamma24      = 2.4
	GammaDCIXYZ  = 2.6
	GammaSRGB    = Gamma22
	GammaRec709  = 1 / 0.45
	GammaRec2020 = GammaRec709
)

// https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2020-2-201510-I!!PDF-E.pdf
const alpha = 1.09929682680944 // 10 * Math.pow(beta, 0.55)
const beta = 0.018053968510807

var (
	// TransferFromDCIXYZ divides by DCINormalizationFactor.
	TransferFromDCIXYZ TransferFunction = func(e float64) float64 {
		return e / DCINormalizationFactor
	}

	// CompandingSRGB is the piecewise sRGB encoding curve.
	//
	// The power segment uses exponent 1/2.4, as the sRGB definition requires;
	// with that exponent the two segments meet at the 0.0031308 threshold.
	// GammaSRGB (2.2) is only the overall approximation of the curve, and
	// using it here made the curve discontinuous at the threshold.
	CompandingSRGB TransferFunction = func(e float64) float64 {
		if e <= 0.0031308 {
			return 12.92 * e
		}
		return 1.055*math.Pow(e, 1/Gamma24) - 0.055
	}

	// CompandingRec709 ITU-R BT.709
	CompandingRec709 TransferFunction = func(e float64) float64 {
		if e < beta {
			return 4.5 * e
		}
		return alpha*math.Pow(e, 1/GammaRec709) - (alpha - 1)
	}

	// PureRec709 is a plain power curve with exponent 1/GammaRec709.
	PureRec709 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/GammaRec709)
	}

	// PureRec709_22 is a plain power curve with exponent 1/2.2.
	PureRec709_22 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/Gamma22)
	}

	// PureRec709_24 is a plain power curve with exponent 1/2.4.
	PureRec709_24 TransferFunction = func(e float64) float64 {
		return math.Pow(e, 1/Gamma24)
	}

	// The Rec. 2020 curves reuse the Rec. 709 implementations.
	CompandingRec2020 = CompandingRec709
	PureRec2020       = PureRec709
	PureRec2020_22    = PureRec709_22
	PureRec2020_24    = PureRec709_24
)
|
||||
|
||||
// LUT is a lookup table indexed by integer code value.
type LUT []float64

// NewGammaLUT builds a table with 2^bits entries where entry i holds
// (i / (2^bits - 1)) raised to the power gamma.
func NewGammaLUT(gamma float64, bits int) (lut LUT) {
	maxCode := (1 << bits) - 1
	lut = make(LUT, maxCode+1)

	for code := range lut {
		normalized := float64(code) / float64(maxCode)
		lut[code] = math.Pow(normalized, gamma)
	}
	return lut
}
|
28
colorspace/xyz.go
Normal file
28
colorspace/xyz.go
Normal file
|
@ -0,0 +1,28 @@
|
|||
package colorspace
|
||||
|
||||
import "math"
|
||||
|
||||
// XYZSystem An absolute representation
|
||||
type XYZSystem struct {
|
||||
gamma float64
|
||||
ToLinearLUT LUT
|
||||
// ToLinear any adjustements on top of gamma
|
||||
ToLinearTransfer TransferFunction
|
||||
}
|
||||
|
||||
func (s XYZSystem) ToLinearFrom16(c uint16) float64 {
|
||||
return s.ToLinearTransfer(s.ToLinearLUT[c])
|
||||
}
|
||||
|
||||
func (s XYZSystem) ToLinear(c float64) float64 {
|
||||
return s.ToLinearTransfer(math.Pow(c, s.gamma))
|
||||
}
|
||||
|
||||
func NewXYZSystem(gamma float64, toLinearTransfer TransferFunction) XYZSystem {
|
||||
return XYZSystem{
|
||||
ToLinearLUT: NewGammaLUT(gamma, 16),
|
||||
ToLinearTransfer: toLinearTransfer,
|
||||
}
|
||||
}
|
||||
|
||||
var DCIXYZSystem = NewXYZSystem(GammaDCIXYZ, TransferFromDCIXYZ)
|
35
colorspace/yuv.go
Normal file
35
colorspace/yuv.go
Normal file
|
@ -0,0 +1,35 @@
|
|||
package colorspace
|
||||
|
||||
import "gonum.org/v1/gonum/mat"
|
||||
|
||||
type YCbCrConverter struct {
|
||||
Kr, Kg, Kb float64
|
||||
}
|
||||
|
||||
func (c YCbCrConverter) ConversionRGB() (to, from *mat.Dense) {
|
||||
const half = 1. / 2.
|
||||
RgbToYPbPr := mat.NewDense(3, 3, []float64{
|
||||
c.Kr, c.Kg, c.Kb,
|
||||
-half * (c.Kr / (1 - c.Kb)), -half * (c.Kg / (1 - c.Kb)), half,
|
||||
half, -half * (c.Kg / (1 - c.Kr)), -half * (c.Kb / (1 - c.Kr)),
|
||||
})
|
||||
YPbPrToRgb := mat.NewDense(3, 3, []float64{
|
||||
1, 0, 2 - 2*c.Kr,
|
||||
1, -(c.Kb / c.Kg) * (2 - 2*c.Kb), -(c.Kr / c.Kg) * (2 - 2*c.Kr),
|
||||
1, 2 - 2*c.Kb, 0,
|
||||
})
|
||||
|
||||
return YPbPrToRgb, RgbToYPbPr
|
||||
}
|
||||
|
||||
func NewYCbCrConverter(kr, kg, kb float64) YCbCrConverter {
|
||||
return YCbCrConverter{
|
||||
Kr: kr,
|
||||
Kg: kg,
|
||||
Kb: kb,
|
||||
}
|
||||
}
|
||||
|
||||
var YCbCr_Rec709 = NewYCbCrConverter(0.2126, 0.7152, 0.0722)
|
||||
|
||||
var YCbCr_Rec2020 = NewYCbCrConverter(0.2127, 0.6780, 0.0593)
|
103
conv/conv.c
Normal file
103
conv/conv.c
Normal file
|
@ -0,0 +1,103 @@
|
|||
#include "conv.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
const double uint16Max = (double)((1<<16)-1);
|
||||
const double int16MaxPlusOne = (double)(1<<15);
|
||||
|
||||
const double* restrict xyz2rgb_g;
|
||||
const double* restrict rgb2yuv_g;
|
||||
double rgbGamma_g;
|
||||
|
||||
|
||||
#if USE_SIMD
|
||||
#include "conv_gen.h"
|
||||
#endif
|
||||
|
||||
#if USE_SIMD && __AVX2__
|
||||
#include "conv_avx.h"
|
||||
|
||||
// Keep this condition identical to the one in conv_avx.h that defines the
// 512-bit load/store helpers (it also requires AVX512DQ); otherwise this
// branch could reference helpers that were never defined.
#if USE_512_WIDE_PIPELINE && __AVX512F__ && __AVX512VL__ && __AVX512BW__ && __AVX512DQ__
|
||||
//512-bit wide pipeline
|
||||
typed_converter_platform(x86_64, SIMD_PUMPS, float, 5, 1, load_packed_512f, store_packed_512f);
|
||||
typed_converter_platform(x86_64, SIMD_PUMPS, double, 2, 1, load_packed_512, store_packed_512);
|
||||
|
||||
const char* DecoderInformation = "SIMD AVX-512 512-bit pipeline (2d 5f " str(SIMD_PUMPS) "pump)";
|
||||
#else
|
||||
|
||||
#if !defined(__FMA__)
|
||||
#define __FMA__ 0
|
||||
const char* DecoderInformation = "SIMD AVX2 256-bit pipeline (1d 2f " str(SIMD_PUMPS) "pump)";
|
||||
#else
|
||||
const char* DecoderInformation = "SIMD AVX2 + FMA 256-bit pipeline (1d 2f " str(SIMD_PUMPS) "pump)";
|
||||
#endif
|
||||
|
||||
//256-bit wide pipeline
|
||||
typed_converter_platform(x86_64, SIMD_PUMPS, float, 2, __FMA__, load_packed_256f, store_packed_256f);
|
||||
typed_converter_platform(x86_64, SIMD_PUMPS, double, 1, __FMA__, load_packed_256, store_packed_256);
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
//No ASM defined
|
||||
#define fType float
|
||||
#include "conv_generic.h"
|
||||
#undef fType
|
||||
#define fType double
|
||||
#include "conv_generic.h"
|
||||
|
||||
#undef USE_SIMD
|
||||
#define USE_SIMD 0
|
||||
|
||||
const char* DecoderInformation = "Generic scalar pipeline (1d 1f 1pump)";
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
const char* decoder_information() {
|
||||
return DecoderInformation;
|
||||
}
|
||||
|
||||
|
||||
void init(const double* restrict xyz2rgb, const double* restrict rgb2yuv, double xyzGamma, double rgbGamma) {
|
||||
xyz2rgb_g = xyz2rgb;
|
||||
rgb2yuv_g = rgb2yuv;
|
||||
rgbGamma_g = 1./rgbGamma;
|
||||
|
||||
for (int i = 0; i < (1 << XYZ_LOOKUP_TABLE_SIZE); i++) {
|
||||
xyz12_to_linear_double[i] = pow((double)(i)/((1 << XYZ_LOOKUP_TABLE_SIZE)-1), xyzGamma);
|
||||
xyz12_to_linear_float[i] = xyz12_to_linear_double[i];
|
||||
}
|
||||
|
||||
#if USE_SIMD
|
||||
_load_matrix_double(xyz2rgb_mat_double, xyz2rgb);
|
||||
_load_matrix_float(xyz2rgb_mat_float, xyz2rgb);
|
||||
_load_matrix_double(rgb2yuv_mat_double, rgb2yuv);
|
||||
_load_matrix_float(rgb2yuv_mat_float, rgb2yuv);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Converts a whole frame with the float pipeline, one line at a time.
// Input advances by 3 values per pixel; each output plane advances by one.
void convert_frame_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
    int rowsLeft = height;

    while (rowsLeft-- > 0) {
        convert_line_dci_xyz12_to_yuv16_float(in, luma, cb, cr, width, height);

        // Step every pointer to the start of the next line.
        in += width*3;
        luma += width;
        cb += width;
        cr += width;
    }
}
|
||||
|
||||
// Converts a whole frame with the double pipeline, one line at a time.
// Input advances by 3 values per pixel; each output plane advances by one.
void convert_frame_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
    int rowsLeft = height;

    while (rowsLeft-- > 0) {
        convert_line_dci_xyz12_to_yuv16_double(in, luma, cb, cr, width, height);

        // Step every pointer to the start of the next line.
        in += width*3;
        luma += width;
        cb += width;
        cr += width;
    }
}
|
127
conv/conv.go
Normal file
127
conv/conv.go
Normal file
|
@ -0,0 +1,127 @@
|
|||
package conv
|
||||
|
||||
import (
|
||||
"gonum.org/v1/gonum/mat"
|
||||
"runtime"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -march=native -mtune=native -Ofast -std=c99
|
||||
#cgo LDFLAGS: -lm
|
||||
#include "conv.h"
|
||||
*/
|
||||
import "C"
|
||||
|
||||
var matPinner runtime.Pinner
|
||||
|
||||
func DecoderInformation() string {
|
||||
return C.GoString(C.decoder_information())
|
||||
}
|
||||
|
||||
func InitData(xyz2rgb, rgb2yuv *mat.Dense, xyzGamma, rgbGamma float64) {
|
||||
a := unsafe.Pointer(unsafe.SliceData(xyz2rgb.RawMatrix().Data))
|
||||
b := unsafe.Pointer(unsafe.SliceData(rgb2yuv.RawMatrix().Data))
|
||||
matPinner.Pin(a)
|
||||
matPinner.Pin(b)
|
||||
|
||||
C.init((*C.double)(a), (*C.double)(b), C.double(xyzGamma), C.double(rgbGamma))
|
||||
}
|
||||
|
||||
func ConvertFrameDCIXYZToYUV16Double(in []uint32, luma, cb, cr []uint16, width, height int) {
|
||||
|
||||
var pinner runtime.Pinner
|
||||
|
||||
inp := unsafe.Pointer(unsafe.SliceData(in))
|
||||
lumap := unsafe.Pointer(unsafe.SliceData(luma))
|
||||
cbp := unsafe.Pointer(unsafe.SliceData(cb))
|
||||
crp := unsafe.Pointer(unsafe.SliceData(cr))
|
||||
|
||||
pinner.Pin(inp)
|
||||
pinner.Pin(lumap)
|
||||
pinner.Pin(cbp)
|
||||
pinner.Pin(crp)
|
||||
defer pinner.Unpin()
|
||||
|
||||
C.convert_frame_dci_xyz12_to_yuv16_double(
|
||||
(*C.uint)(inp),
|
||||
(*C.ushort)(lumap),
|
||||
(*C.ushort)(cbp),
|
||||
(*C.ushort)(crp),
|
||||
C.int(width),
|
||||
C.int(height),
|
||||
)
|
||||
}
|
||||
|
||||
func ConvertLineDCIXYZToYUV16Double(in []uint32, luma, cb, cr []uint16, width, height int) {
|
||||
var pinner runtime.Pinner
|
||||
|
||||
inp := unsafe.Pointer(unsafe.SliceData(in))
|
||||
lumap := unsafe.Pointer(unsafe.SliceData(luma))
|
||||
cbp := unsafe.Pointer(unsafe.SliceData(cb))
|
||||
crp := unsafe.Pointer(unsafe.SliceData(cr))
|
||||
|
||||
pinner.Pin(inp)
|
||||
pinner.Pin(lumap)
|
||||
pinner.Pin(cbp)
|
||||
pinner.Pin(crp)
|
||||
defer pinner.Unpin()
|
||||
|
||||
C.convert_line_dci_xyz12_to_yuv16_double(
|
||||
(*C.uint)(inp),
|
||||
(*C.ushort)(lumap),
|
||||
(*C.ushort)(cbp),
|
||||
(*C.ushort)(crp),
|
||||
C.int(width),
|
||||
C.int(height),
|
||||
)
|
||||
}
|
||||
|
||||
func ConvertFrameDCIXYZToYUV16Float(in []uint32, luma, cb, cr []uint16, width, height int) {
|
||||
|
||||
var pinner runtime.Pinner
|
||||
|
||||
inp := unsafe.Pointer(unsafe.SliceData(in))
|
||||
lumap := unsafe.Pointer(unsafe.SliceData(luma))
|
||||
cbp := unsafe.Pointer(unsafe.SliceData(cb))
|
||||
crp := unsafe.Pointer(unsafe.SliceData(cr))
|
||||
|
||||
pinner.Pin(inp)
|
||||
pinner.Pin(lumap)
|
||||
pinner.Pin(cbp)
|
||||
pinner.Pin(crp)
|
||||
defer pinner.Unpin()
|
||||
|
||||
C.convert_frame_dci_xyz12_to_yuv16_float(
|
||||
(*C.uint)(inp),
|
||||
(*C.ushort)(lumap),
|
||||
(*C.ushort)(cbp),
|
||||
(*C.ushort)(crp),
|
||||
C.int(width),
|
||||
C.int(height),
|
||||
)
|
||||
}
|
||||
|
||||
func ConvertLineDCIXYZToYUV16Float(in []uint32, luma, cb, cr []uint16, width, height int) {
|
||||
var pinner runtime.Pinner
|
||||
|
||||
inp := unsafe.Pointer(unsafe.SliceData(in))
|
||||
lumap := unsafe.Pointer(unsafe.SliceData(luma))
|
||||
cbp := unsafe.Pointer(unsafe.SliceData(cb))
|
||||
crp := unsafe.Pointer(unsafe.SliceData(cr))
|
||||
|
||||
pinner.Pin(inp)
|
||||
pinner.Pin(lumap)
|
||||
pinner.Pin(cbp)
|
||||
pinner.Pin(crp)
|
||||
defer pinner.Unpin()
|
||||
|
||||
C.convert_line_dci_xyz12_to_yuv16_float(
|
||||
(*C.uint)(inp),
|
||||
(*C.ushort)(lumap),
|
||||
(*C.ushort)(cbp),
|
||||
(*C.ushort)(crp),
|
||||
C.int(width),
|
||||
C.int(height),
|
||||
)
|
||||
}
|
50
conv/conv.h
Normal file
50
conv/conv.h
Normal file
|
@ -0,0 +1,50 @@
|
|||
#include <stdint.h>
|
||||
|
||||
void init(const double* restrict xyz2rgb, const double* restrict rgb2yuv, double xyzGamma, double rgbGamma);
|
||||
|
||||
|
||||
// Use available SIMD. Disable to enforce generic pipeline.
|
||||
#if !defined(USE_SIMD)
|
||||
#define USE_SIMD 1
|
||||
#endif
|
||||
|
||||
// Opportunistically use AVX512 features even in 256-bit mode
|
||||
#if !defined(USE_OPPORTUNISTIC_AVX512)
|
||||
#define USE_OPPORTUNISTIC_AVX512 1
|
||||
#endif
|
||||
|
||||
//AVX2
|
||||
// double layout aaa0
|
||||
// float layout aaabbb00
|
||||
|
||||
// Enable usage of 512-bit wide pipeline, pumping two pixels every iteration, if supported.
|
||||
// Uses AVX-512 features. Requires AVX-512 F, VL, BW and DQ. Layout aaabbb00
|
||||
// double layout aaabbb00
|
||||
// float layout aaabbbcccdddeee0
|
||||
#if !defined(USE_512_WIDE_PIPELINE)
|
||||
#define USE_512_WIDE_PIPELINE 1
|
||||
#endif
|
||||
|
||||
// Sets the number of pumps per iteration on pipeline. Supported 1, 2, 4, 8
|
||||
// Increase this if your architecture has more execution units than usual. Recommended to stay at 2 or 4.
|
||||
#if !defined(SIMD_PUMPS)
|
||||
#define SIMD_PUMPS 1
|
||||
#endif
|
||||
|
||||
// ExpandLoad or CompressStore is slower than currently doing set(vals...). todo: inspect with newer CPUs than ZEN4
|
||||
#if !defined(USE_AVX512_EXPANDLOAD)
|
||||
#define USE_AVX512_EXPANDLOAD 0
|
||||
#endif
|
||||
|
||||
// Size of the lookup table. Only valid value is 12.
|
||||
#if !defined(XYZ_LOOKUP_TABLE_SIZE)
|
||||
#define XYZ_LOOKUP_TABLE_SIZE 12
|
||||
#endif
|
||||
|
||||
// This function can run slightly out of bounds: about 24 bytes per pump per line (XYZ) or 8 bytes per pump per line (YUV). The caller should allocate extra space in the input/output buffers for this reason.
|
||||
void convert_line_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
|
||||
void convert_frame_dci_xyz12_to_yuv16_float(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
|
||||
void convert_line_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
|
||||
void convert_frame_dci_xyz12_to_yuv16_double(const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height);
|
||||
|
||||
const char* decoder_information();
|
271
conv/conv_avx.h
Normal file
271
conv/conv_avx.h
Normal file
|
@ -0,0 +1,271 @@
|
|||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// x86_64 AVX2 and AVX512 definitions
|
||||
|
||||
|
||||
|
||||
#define _perm_component_i(i, c, v) ((i*v+c)*2)
|
||||
#define _perm_component2(c) _perm_component_i(1, c, 4), _perm_component_i(0, c, 4)
|
||||
#define _perm_component5(c) _perm_component_i(4, c, 3), _perm_component_i(3, c, 3), _perm_component_i(2, c, 3), _perm_component_i(1, c, 3), _perm_component_i(0, c, 3)
|
||||
|
||||
#if USE_512_WIDE_PIPELINE && __AVX512F__ && __AVX512VL__ && __AVX512BW__ && __AVX512DQ__
|
||||
|
||||
// Loads the two pixels starting at pixel index j (6 consecutive dwords) and
// returns them as x0 y0 z0 0 x1 y1 z1 0.
__m256i load_packed_512(int j, const uint32_t* restrict in) {
    const uint32_t* src = in + j*3;
#if USE_AVX512_EXPANDLOAD
    //todo: this path is slower than _mm256_set_epi32, both mask and maskz ZEN4
    const __mmask8 lanes = _cvtu32_mask8(0b01110111);
    return _mm256_maskz_expandloadu_epi32(lanes, src);
#else
    // Masked load zeroes lanes 6 and 7; the permute then moves the second pixel
    // to lanes 4-6 and fills lanes 3 and 7 from the zeroed lane 7.
    const __m256i packed = _mm256_maskz_loadu_epi32(_cvtu32_mask8(0b00111111), src);
    const __m256i spread = _mm256_set_epi32(7, 5, 4, 3, 7, 2, 1, 0);
    return _mm256_permutexvar_epi32(spread, packed);
#endif
}
|
||||
|
||||
void store_packed_512(__m256i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
|
||||
#define _perm_idx512 _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component2(2), _perm_component2(1), _perm_component2(0))
|
||||
|
||||
__m128i store = _mm256_castsi256_si128(_mm256_permutexvar_epi16(_perm_idx512, packed));
|
||||
_mm_mask_storeu_epi16(&luma[j], _cvtu32_mask8(0b00000011), store);
|
||||
_mm_mask_storeu_epi16(&cb[j-2], _cvtu32_mask8(0b00001100), store);
|
||||
_mm_mask_storeu_epi16(&cr[j-4], _cvtu32_mask8(0b00110000), store);
|
||||
}
|
||||
|
||||
|
||||
// Loads the five pixels starting at pixel index j (15 consecutive dwords);
// the 16th lane is zeroed by the mask.
__m512i load_packed_512f(int j, const uint32_t* restrict in) {
    const __mmask16 fifteenLanes = _cvtu32_mask16(0b0111111111111111);
    return _mm512_maskz_loadu_epi32(fifteenLanes, in+j*3);
}
|
||||
|
||||
void store_packed_512f(__m512i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
|
||||
#define _perm_idx512f _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component5(2), _perm_component5(1), _perm_component5(0))
|
||||
|
||||
__m256i store = _mm512_castsi512_si256(_mm512_permutexvar_epi16(_perm_idx512f, packed));
|
||||
_mm256_mask_storeu_epi16(&luma[j ], _cvtu32_mask16(0b0000000000011111), store);
|
||||
_mm256_mask_storeu_epi16(&cb[j -5], _cvtu32_mask16(0b0000001111100000), store);
|
||||
_mm256_mask_storeu_epi16(&cr[j-10], _cvtu32_mask16(0b0111110000000000), store);
|
||||
}
|
||||
|
||||
#define SSE 1
|
||||
#define AVX 1
|
||||
#define AVX512 1
|
||||
#include "simd_utils/simd_utils.h"
|
||||
|
||||
static inline __m512d pow512_pd1(__m512d x, double y1) {
|
||||
const __m512d y = _mm512_set1_pd(y1);
|
||||
return exp512_pd(_mm512_mul_pd(y, log512_pd(x)));
|
||||
}
|
||||
|
||||
static inline __m512 pow512_ps1(__m512 x, double y1) {
|
||||
const __m512d y = _mm512_set1_pd(y1);
|
||||
//low precision cause issues, use doubles
|
||||
__m256 a = _mm512_cvtpd_ps(exp512_pd(_mm512_mul_pd(y, log512_pd(_mm512_cvtps_pd(_mm512_castps512_ps256(x))))));
|
||||
__m256 b = _mm512_cvtpd_ps(exp512_pd(_mm512_mul_pd(y, log512_pd(_mm512_cvtps_pd(_mm512_extractf32x8_ps(x, 1))))));
|
||||
|
||||
return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Loads the pixel at pixel index j (3 consecutive dwords) into lanes 0-2.
// With AVX-512 available the fourth lane is zeroed by the mask; the fallback
// performs a full 128-bit load, so lane 3 holds whatever follows the pixel.
__m128i load_packed_256(int j, const uint32_t* restrict in) {
#if USE_OPPORTUNISTIC_AVX512 && __AVX512F__ && __AVX512VL__ && __AVX512DQ__
    return _mm_maskz_loadu_epi32(_cvtu32_mask8(0b00000111), in+j*3);
#else
    // _mm_loadu_epi32 is an AVX-512F/VL intrinsic and is rejected by the
    // compiler when the target lacks AVX-512, which is exactly when this
    // branch is used. The SSE2 unaligned load reads the same 128 bits.
    return _mm_loadu_si128((const __m128i*)(in+j*3));
#endif
}
|
||||
|
||||
// Writes one converted pixel: dwords 0, 1 and 2 of packed go to luma[j], cb[j]
// and cr[j], truncated to 16 bits by the assignment.
void store_packed_256(__m128i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
    int32_t lanes[4];
    _mm_storeu_si128((__m128i*)lanes, packed);

    luma[j] = lanes[0];
    cb[j] = lanes[1];
    cr[j] = lanes[2];
}
|
||||
|
||||
/*#if USE_DOT_PRODUCT
|
||||
//position entries properly in vectors ready for dot product
|
||||
m[0] = _mm256_set_ps(0, in[2], in[1], in[0], 0, in[2], in[1], in[0]); //0rrr 0rrr
|
||||
m[1] = _mm256_set_ps(0, in[5], in[4], in[3], 0, in[5], in[4], in[3]); //0ggg 0ggg
|
||||
m[2] = _mm256_set_ps(0, in[8], in[7], in[6], 0, in[8], in[7], in[6]); //0bbb 0bbb
|
||||
|
||||
__m256 a = _mm256_dp_ps(v, m0, 0b01110001);
|
||||
__m256 b = _mm256_dp_ps(v, m1, 0b01110010);
|
||||
__m256 c = _mm256_dp_ps(v, m2, 0b01110100);
|
||||
return _mm256_blend_ps(_mm256_blend_ps(a, b, 0b00100010), c, 0b01000100);
|
||||
*/
|
||||
|
||||
// Loads the two pixels starting at pixel index j (6 consecutive dwords) and
// returns them as x0 y0 z0 0 x1 y1 z1 0.
//
// j is a pixel index in every branch. The plain-AVX2 fallback used to read
// in[j..j+5], treating j as a component index, which disagreed with the
// in+j*3 addressing of the other branches.
__m256i load_packed_256f(int j, const uint32_t* restrict in) {
    const uint32_t* src = in + j*3;
#if USE_OPPORTUNISTIC_AVX512 && USE_AVX512_EXPANDLOAD && __AVX512VBMI2__ && __AVX512VL__ && __AVX512F__
    //todo: slow ZEN4
    return _mm256_maskz_expandloadu_epi32(_cvtu32_mask8(0b01110111), src);
#elif USE_OPPORTUNISTIC_AVX512 && __AVX512F__ && __AVX512BW__ && __AVX512VL__ && __AVX512DQ__
    return _mm256_permutexvar_epi32(_mm256_set_epi32(7, 5, 4, 3, 7, 2, 1, 0), _mm256_maskz_loadu_epi32(_cvtu32_mask8(0b00111111), src));
#else
    return _mm256_set_epi32(0, src[5], src[4], src[3], 0, src[2], src[1], src[0]);
#endif
}

// Writes two converted pixels to luma/cb/cr at index j and j+1.
//
// j is a pixel index in both branches. The plain-AVX2 fallback used to write
// to index j/3, which disagreed with the AVX-512 branch writing to index j.
void store_packed_256f(__m256i packed, int j, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr) {
#if USE_OPPORTUNISTIC_AVX512 && __AVX512BW__ && __AVX512VL__ && __AVX512DQ__

    #define _perm_idx256 _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, _perm_component2(2), _perm_component2(1), _perm_component2(0))

    // Gather the low 16 bits of dwords 0,4 / 1,5 / 2,6 into words 0-1 / 2-3 / 4-5;
    // the negative offsets cancel the word position selected by each mask.
    __m128i store = _mm256_castsi256_si128(_mm256_permutexvar_epi16(_perm_idx256, packed));
    _mm_mask_storeu_epi16(&luma[j], _cvtu32_mask8(0b00000011), store);
    _mm_mask_storeu_epi16(&cb[j-2], _cvtu32_mask8(0b00001100), store);
    _mm_mask_storeu_epi16(&cr[j-4], _cvtu32_mask8(0b00110000), store);

#else
    int32_t lanes[8];
    _mm256_storeu_si256((__m256i*)lanes, packed);

    // First pixel lives in dwords 0-2, second pixel in dwords 4-6.
    luma[j] = lanes[0];
    cb[j] = lanes[1];
    cr[j] = lanes[2];

    luma[j+1] = lanes[4];
    cb[j+1] = lanes[5];
    cr[j+1] = lanes[6];

#endif
}
|
||||
|
||||
#define SSE 1
|
||||
#define AVX 1
|
||||
#include "simd_utils/simd_utils.h"
|
||||
|
||||
static inline __m256d pow256_pd1(__m256d x, double y1) {
|
||||
const __m256d y = _mm256_set1_pd(y1);
|
||||
return exp256_pd(_mm256_mul_pd(y, log256_pd(x)));
|
||||
}
|
||||
|
||||
static inline __m256 pow256_ps1(__m256 x, double y1) {
|
||||
const __m256d y = _mm256_set1_pd(y1);
|
||||
//low precision cause issues, use doubles
|
||||
__m128 a = _mm256_cvtpd_ps(exp256_pd(_mm256_mul_pd(y, log256_pd(_mm256_cvtps_pd(_mm256_castps256_ps128(x))))));
|
||||
__m128 b = _mm256_cvtpd_ps(exp256_pd(_mm256_mul_pd(y, log256_pd(_mm256_cvtps_pd(_mm256_extractf128_ps(x, 1))))));
|
||||
|
||||
return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
// Vector register width in bits, keyed by <floatType>_<elementCount>. These
// values are pasted after the type/function prefixes to form names such as
// __m512 or _mm256.
#define typed_vector_size_x86_64_float_5 512
|
||||
#define typed_vector_size_x86_64_double_2 512
|
||||
#define typed_vector_size_x86_64_float_2 256
|
||||
#define typed_vector_size_x86_64_double_1 256
|
||||
|
||||
// Middle fragment of the lane-permute intrinsic name for each configuration;
// typed_func_x86_64_permute_lanes pastes it between the function prefix and the
// ps/pd suffix.
#define typed_permute_lanes_x86_64_float_5 _permutexvar_
|
||||
#define typed_permute_lanes_x86_64_double_2 _permutex_
|
||||
#define typed_permute_lanes_x86_64_float_2 _permute_
|
||||
#define typed_permute_lanes_x86_64_double_1 _permute4x64_
|
||||
|
||||
|
||||
inline __attribute__((always_inline)) __m512 _int_i32gather_ps(void const* base_addr, __m512i vindex, int scale) {
|
||||
switch(scale){
|
||||
case 1:
|
||||
return _mm512_i32gather_ps(vindex, base_addr, 1);
|
||||
case 2:
|
||||
return _mm512_i32gather_ps(vindex, base_addr, 2);
|
||||
case 4:
|
||||
return _mm512_i32gather_ps(vindex, base_addr, 4);
|
||||
case 8:
|
||||
return _mm512_i32gather_ps(vindex, base_addr, 8);
|
||||
default:
|
||||
__builtin_unreachable();
|
||||
}
|
||||
}
|
||||
|
||||
inline __attribute__((always_inline)) __m512d _int_i32gather_pd(void const* base_addr, __m256i vindex, int scale) {
|
||||
switch(scale){
|
||||
case 1:
|
||||
return _mm512_i32gather_pd(vindex, base_addr, 1);
|
||||
case 2:
|
||||
return _mm512_i32gather_pd(vindex, base_addr, 2);
|
||||
case 4:
|
||||
return _mm512_i32gather_pd(vindex, base_addr, 4);
|
||||
case 8:
|
||||
return _mm512_i32gather_pd(vindex, base_addr, 8);
|
||||
default:
|
||||
__builtin_unreachable();
|
||||
}
|
||||
}
|
||||
|
||||
// Power function used for each <floatType>_<elementCount> configuration, and
// the selector macro that picks one by token pasting.
#define typed_pow1_x86_64_float_5 pow512_ps1
|
||||
#define typed_pow1_x86_64_double_2 pow512_pd1
|
||||
#define typed_pow1_x86_64_float_2 pow256_ps1
|
||||
#define typed_pow1_x86_64_double_1 pow256_pd1
|
||||
#define typed_pow1_x86_64(floatType, elementCount) typed_pow1_x86_64_##floatType##_##elementCount
|
||||
|
||||
// Gather function used for each configuration. The 512-bit entries go through
// the _int_i32gather_* adapters; the 256-bit entries name the intrinsics
// directly. All are called as (base, index, scale).
#define typed_i32gather_x86_64_float_5 _int_i32gather_ps
|
||||
#define typed_i32gather_x86_64_double_2 _int_i32gather_pd
|
||||
#define typed_i32gather_x86_64_float_2 _mm256_i32gather_ps
|
||||
#define typed_i32gather_x86_64_double_1 _mm256_i32gather_pd
|
||||
#define typed_i32gather_x86_64(floatType, elementCount) typed_i32gather_x86_64_##floatType##_##elementCount
|
||||
|
||||
// Name fragments used to assemble x86-64 intrinsic and vector type names:
// function prefix (_mm), type prefix (__m) and the type suffixes for double (d)
// and integer (i) vectors.
#define typed_vector_func_prefix_x86_64 _mm
|
||||
#define typed_vector_type_prefix_x86_64 __m
|
||||
#define typed_vector_type_suffix_x86_64_double d
|
||||
#define typed_vector_type_suffix_x86_64_int i
|
||||
|
||||
// Bit width of the integer (index / packed result) vector paired with each
// configuration, and the matching integer intrinsic prefix (_mm512, _mm256 or
// plain _mm for the 128-bit case).
#define typed_vector_int_size_x86_64_float_5 512
|
||||
#define typed_vector_int_func_x86_64_float_5 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_float_5)
|
||||
#define typed_vector_int_size_x86_64_double_2 256
|
||||
#define typed_vector_int_func_x86_64_double_2 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_double_2)
|
||||
#define typed_vector_int_size_x86_64_float_2 256
|
||||
#define typed_vector_int_func_x86_64_float_2 concat(typed_vector_func_prefix_x86_64, typed_vector_int_size_x86_64_float_2)
|
||||
#define typed_vector_int_size_x86_64_double_1 128
|
||||
#define typed_vector_int_func_x86_64_double_1 typed_vector_func_prefix_x86_64
|
||||
|
||||
// Intrinsic name suffix for each float type (ps for float, pd for double).
#define typed_vector_func_type_x86_64_float ps
|
||||
#define typed_vector_func_type_x86_64_double pd
|
||||
|
||||
// Floating-point vector type name: __m<size> for float, __m<size>d for double.
#define typed_vector_type_x86_64_float(elementCount) concat(typed_vector_type_prefix_x86_64, typed_vector_size_x86_64_float_##elementCount)
|
||||
#define typed_vector_type_x86_64_double(elementCount) concat3(typed_vector_type_prefix_x86_64, typed_vector_size_x86_64_double_##elementCount, typed_vector_type_suffix_x86_64_double)
|
||||
|
||||
#define typed_vector_type_x86_64(floatType, elementCount) typed_vector_type_x86_64_##floatType(elementCount)
|
||||
|
||||
// Integer vector type name: __m<int size>i.
#define typed_vector_int_type_x86_64(floatType, elementCount) concat3(typed_vector_type_prefix_x86_64, typed_vector_int_size_x86_64_##floatType##_##elementCount, typed_vector_type_suffix_x86_64_int)
|
||||
|
||||
// Floating-point intrinsic prefix: _mm<size>.
#define typed_vector_func_x86_64_float(elementCount) concat(typed_vector_func_prefix_x86_64, typed_vector_size_x86_64_float_##elementCount)
|
||||
#define typed_vector_func_x86_64_double(elementCount) concat(typed_vector_func_prefix_x86_64, typed_vector_size_x86_64_double_##elementCount)
|
||||
|
||||
#define typed_vector_func_x86_64(floatType, elementCount) typed_vector_func_x86_64_##floatType(elementCount)
|
||||
|
||||
|
||||
|
||||
// Lane-permute intrinsic name fragment per configuration.
// NOTE(review): these four definitions repeat, token for token, ones that
// already appear earlier in this file.
#define typed_permute_lanes_x86_64_float_5 _permutexvar_
|
||||
#define typed_permute_lanes_x86_64_double_2 _permutex_
|
||||
#define typed_permute_lanes_x86_64_float_2 _permute_
|
||||
#define typed_permute_lanes_x86_64_double_1 _permute4x64_
|
||||
|
||||
// Operation fragments of intrinsic names. The floating-point ones are pasted
// between the _mm<size> prefix and the ps/pd suffix; the integer and conversion
// ones already carry their suffix.
#define typed_fmadd_x86_64 _fmadd_
|
||||
#define typed_add_x86_64 _add_
|
||||
#define typed_mul_x86_64 _mul_
|
||||
#define typed_set1_x86_64 _set1_
|
||||
#define typed_set_x86_64 _set_
|
||||
#define typed_min_x86_64 _min_
|
||||
#define typed_max_x86_64 _max_
|
||||
#define typed_seti_x86_64 _set_epi32
|
||||
#define typed_addi_x86_64 _add_epi32
|
||||
#define typed_cvt_x86_64_float _cvtps_epi32
|
||||
#define typed_cvt_x86_64_double _cvtpd_epi32
|
||||
|
||||
// Builders for complete intrinsic names, selected through
// typed_func(platform, floatType, elementCount, <op>). Floating-point ops expand
// to <prefix><op><ps|pd>; ftoi appends the float-to-int32 conversion name; seti
// and addi use the integer-vector prefix of the configuration.
#define typed_func_x86_64_fmadd(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_fmadd_x86_64, typed_vector_func_type_x86_64_##floatType)
|
||||
#define typed_func_x86_64_add(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_add_x86_64, typed_vector_func_type_x86_64_##floatType)
|
||||
#define typed_func_x86_64_mul(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_mul_x86_64, typed_vector_func_type_x86_64_##floatType)
|
||||
#define typed_func_x86_64_set1(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_set1_x86_64, typed_vector_func_type_x86_64_##floatType)
|
||||
#define typed_func_x86_64_set(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_set_x86_64, typed_vector_func_type_x86_64_##floatType)
|
||||
#define typed_func_x86_64_min(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_min_x86_64, typed_vector_func_type_x86_64_##floatType)
|
||||
#define typed_func_x86_64_max(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_max_x86_64, typed_vector_func_type_x86_64_##floatType)
|
||||
#define typed_func_x86_64_ftoi(floatType, elementCount) concat(typed_vector_func_x86_64(floatType, elementCount), typed_cvt_x86_64_##floatType)
|
||||
#define typed_func_x86_64_seti(floatType, elementCount) concat(typed_vector_int_func_x86_64_##floatType##_##elementCount, typed_seti_x86_64)
|
||||
#define typed_func_x86_64_addi(floatType, elementCount) concat(typed_vector_int_func_x86_64_##floatType##_##elementCount, typed_addi_x86_64)
|
||||
|
||||
// pow1 and i32gather are looked up in per-configuration tables rather than
// assembled from fragments.
#define typed_func_x86_64_pow1(floatType, elementCount) typed_pow1_x86_64(floatType, elementCount)
|
||||
#define typed_func_x86_64_i32gather(floatType, elementCount) typed_i32gather_x86_64(floatType, elementCount)
|
||||
#define typed_func_x86_64_permute_lanes(floatType, elementCount) concat3(typed_vector_func_x86_64(floatType, elementCount), typed_permute_lanes_x86_64_##floatType##_##elementCount, typed_vector_func_type_x86_64_##floatType)
|
||||
|
||||
// Lane-permute intrinsic name fragment per configuration.
// NOTE(review): these four definitions repeat, token for token, ones that
// already appear earlier in this file.
#define typed_permute_lanes_x86_64_float_5 _permutexvar_
|
||||
#define typed_permute_lanes_x86_64_double_2 _permutex_
|
||||
#define typed_permute_lanes_x86_64_float_2 _permute_
|
||||
#define typed_permute_lanes_x86_64_double_1 _permute4x64_
|
420
conv/conv_gen.h
Normal file
420
conv/conv_gen.h
Normal file
|
@ -0,0 +1,420 @@
|
|||
|
||||
|
||||
|
||||
// Preprocessor helpers. Each public macro forwards to an underscore-prefixed
// worker so that its arguments are macro-expanded before being used verbatim
// (lit), stringified (str) or token-pasted (concat, concat3).
#define _lit(s) s
|
||||
#define lit(s) _lit(s)
|
||||
#define _str(s) #s
|
||||
#define str(s) _str(s)
|
||||
#define _concat(a,b) a##b
|
||||
#define concat(a,b) _concat(a,b)
|
||||
#define _concat3(a,b,c) a##b##c
|
||||
#define concat3(a,b,c) _concat3(a,b,c)
|
||||
|
||||
|
||||
// Table lookup of the packed sample codes through the xyz12_to_linear_<type>
// tables, using the supplied gather function with a byte scale of 8 for double
// and 4 for float. Only a 12-bit lookup table is supported; any other
// XYZ_LOOKUP_TABLE_SIZE stops compilation.
#if XYZ_LOOKUP_TABLE_SIZE==12
|
||||
#define typed_gather_double(i32gather, packed) i32gather(xyz12_to_linear_double, packed, 8)
|
||||
#define typed_gather_float(i32gather, packed) i32gather(xyz12_to_linear_float, packed, 4)
|
||||
#else
|
||||
#error "Not supported"
|
||||
#endif
|
||||
|
||||
// Index vectors for the 16-lane variable permute. Each _shuf_lane_stepN(l)
// yields three source-lane indices for the pixel triplet l, and _shuf_idx feeds
// triplets 4 down to 0 to seti after a leading 0xf argument.
// NOTE(review): assumes seti takes its arguments from the highest lane to the
// lowest (as _mm512_set_epi32 does), which would keep lane 15 in place - confirm.
#define _shuf_lane_step1(l) 3*l, 3*l+2, 3*l+1
|
||||
#define _shuf_lane_step2(l) 3*l+1, 3*l, 3*l+2
|
||||
#define _shuf_idx(seti, step) seti(0xf, _shuf_lane_step##step(4), _shuf_lane_step##step(3), _shuf_lane_step##step(2), _shuf_lane_step##step(1), _shuf_lane_step##step(0))
|
||||
|
||||
// xxxxxyyyyyzzzzz0
|
||||
|
||||
// 3x3 matrix times vector for the five-pixel layout, FMA variant. Evaluates
//   perm2(v)*m2 + (perm1(v)*m1 + v*m0)
// where perm1/perm2 are variable permutes of v driven by _shuf_idx(seti, 1) and
// _shuf_idx(seti, 2), and m0..m2 are the vectors filled by typed_mat_load_func_5.
// fVectorType, mul's sibling add, and the other unused parameters exist so all
// variants share one parameter list.
#define typed_mul_vec_dot_5_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
|
||||
fmadd( \
|
||||
permuteLanesVar(_shuf_idx(seti, 2), v), \
|
||||
m2, \
|
||||
fmadd( \
|
||||
permuteLanesVar(_shuf_idx(seti, 1), v), \
|
||||
m1, \
|
||||
mul(v, m0) \
|
||||
) \
|
||||
)
|
||||
|
||||
// 3x3 matrix times vector for the four-lanes-per-pixel layout, FMA variant.
// Evaluates perm(v, 0b11010010)*m2 + (perm(v, 0b11001001)*m1 + v*m0), where
// permuteLanes takes an immediate control byte and m0..m2 come from
// typed_mat_load_func_2 / typed_mat_load_func_1.
#define typed_mul_vec_dot_2_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
|
||||
fmadd( \
|
||||
permuteLanes(v, 0b11010010), \
|
||||
m2, \
|
||||
fmadd( \
|
||||
permuteLanes(v, 0b11001001), \
|
||||
m1, \
|
||||
mul(v, m0) \
|
||||
) \
|
||||
)
|
||||
|
||||
// Non-FMA counterpart of typed_mul_vec_dot_2_fma_1: the same three products
// (v*m0, perm(v, 0b11001001)*m1, perm(v, 0b11010010)*m2) combined with separate
// mul and add calls instead of fmadd.
#define typed_mul_vec_dot_2_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
|
||||
add( \
|
||||
add( \
|
||||
mul(v, m0), \
|
||||
mul(permuteLanes(v, 0b11001001), m1) \
|
||||
), \
|
||||
mul(permuteLanes(v, 0b11010010), m2) \
|
||||
)
|
||||
|
||||
// The one-pixel configuration reuses the two-pixel expressions unchanged.
#define typed_mul_vec_dot_1_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
|
||||
typed_mul_vec_dot_2_fma_1(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v)
|
||||
|
||||
#define typed_mul_vec_dot_1_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v) \
|
||||
typed_mul_vec_dot_2_fma_0(fVectorType, seti, fmadd, mul, add, permuteLanes, m0, m1, m2, v)
|
||||
|
||||
|
||||
// Dispatchers: select the _fma_0 or _fma_1 variant by pasting the `fma`
// argument (0 or 1) onto the name.
// NOTE(review): no typed_mul_vec_dot_5_fma_0 is defined in the visible part of
// this file, so the five-pixel configuration appears to require fma == 1.
#define typed_mul_vec_dot_1(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
|
||||
typed_mul_vec_dot_1_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
|
||||
|
||||
#define typed_mul_vec_dot_2(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
|
||||
typed_mul_vec_dot_2_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
|
||||
|
||||
#define typed_mul_vec_dot_5(fma, fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v) \
|
||||
typed_mul_vec_dot_5_fma_##fma(fVectorType, seti, fmadd, mul, add, permuteLanesVar, m0, m1, m2, v)
|
||||
|
||||
// Integer offset vectors added to the converted result, one per pixel layout.
// Two of every three component lanes receive int16MaxPlusOne and the remaining
// lane (and any padding lane) receives 0.
// NOTE(review): presumably the offset recentres the two chroma components while
// luma is left as is - confirm against the packed store functions.
#define typed_yuv_add_5(seti) \
|
||||
seti(0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0, int16MaxPlusOne, int16MaxPlusOne, 0)
|
||||
#define typed_yuv_add_2(seti) \
|
||||
seti(0, int16MaxPlusOne, int16MaxPlusOne, 0, 0, int16MaxPlusOne, int16MaxPlusOne, 0)
|
||||
#define typed_yuv_add_1(seti) \
|
||||
seti(0, int16MaxPlusOne, int16MaxPlusOne, 0)
|
||||
|
||||
|
||||
// Fill m[0..2] from the nine matrix values in[0..8], arranged for
// the typed_mul_vec_dot_* macros: m[0] repeats (in[0], in[4], in[8]), m[1]
// repeats (in[1], in[5], in[6]) and m[2] repeats (in[2], in[3], in[7]), listed
// here from the last setf argument backwards. The _5 variant packs five
// triplets plus one zero; the _2 and _1 variants pad every triplet with a zero.
#define typed_mat_load_func_5(setf, m, in) \
|
||||
m[0] = setf(0, in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0], in[8], in[4], in[0]); /*0 bgr bgr bgr bgr bgr*/ \
|
||||
m[1] = setf(0, in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1], in[6], in[5], in[1]); /*0 rbg rbg rbg rbg rbg*/ \
|
||||
m[2] = setf(0, in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2], in[7], in[3], in[2]); /*0 grb grb grb grb grb*/
|
||||
|
||||
#define typed_mat_load_func_2(setf, m, in) \
|
||||
m[0] = setf(0, in[8], in[4], in[0], 0, in[8], in[4], in[0]); /* 0bgr 0bgr*/ \
|
||||
m[1] = setf(0, in[6], in[5], in[1], 0, in[6], in[5], in[1]); /* 0rbg 0rbg*/ \
|
||||
m[2] = setf(0, in[7], in[3], in[2], 0, in[7], in[3], in[2]); /* 0grb 0grb*/
|
||||
|
||||
#define typed_mat_load_func_1(setf, m, in) \
|
||||
m[0] = setf(0, in[8], in[4], in[0]); /* 0bgr*/ \
|
||||
m[1] = setf(0, in[6], in[5], in[1]); /* 0rbg*/ \
|
||||
m[2] = setf(0, in[7], in[3], in[2]); /* 0grb*/
|
||||
|
||||
// todo: optimize pow via https://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent ?
|
||||
|
||||
// Name stems for the generated functions; the float type is appended with
// concat(), giving e.g. _load_matrix_float and
// convert_line_dci_xyz12_to_yuv16_float.
#define _load_matrix_func _load_matrix_
|
||||
#define _line_func convert_line_dci_xyz12_to_yuv16_
|
||||
//concat(_line_func, floatType)
|
||||
|
||||
// typed_converter_pump1: generates, for one floatType, the globals
// xyz2rgb_mat_<floatType>, rgb2yuv_mat_<floatType> and
// xyz12_to_linear_<floatType>, the matrix loader _load_matrix_<floatType>, and
// the line converter convert_line_dci_xyz12_to_yuv16_<floatType>.
// Each loop iteration processes one vector (j advances by elementCount):
//   packedLoadFunc -> table gather -> XYZ-to-RGB matrix -> clamp ->
//   fpow(rgb, rgbGamma_g) -> RGB-to-YUV matrix -> ftoi + yuv2packed_add ->
//   packedStoreFunc.
// minValue holds 1.0 and maxValue holds 0.0, so maxf(minf(v, minValue), maxValue)
// clamps to [0, 1] despite what the names suggest.
// NOTE(review): the loop has no tail handling and `height` is never read;
// presumably callers guarantee width is a multiple of the step or pad the
// buffers - confirm.
#define typed_converter_pump1(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
|
||||
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
|
||||
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
|
||||
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
|
||||
\
|
||||
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
|
||||
typed_mat_load_func_##elementCount(setf, m, in) \
|
||||
}; \
|
||||
\
|
||||
\
|
||||
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
|
||||
iPackedType packed; \
|
||||
fVectorType xyz; \
|
||||
fVectorType rgb; \
|
||||
fVectorType yuv; \
|
||||
\
|
||||
const fVectorType minValue = set1f(1.); \
|
||||
const fVectorType maxValue = set1f(0.); \
|
||||
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
|
||||
\
|
||||
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
|
||||
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
|
||||
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
|
||||
\
|
||||
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
|
||||
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
|
||||
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
|
||||
\
|
||||
for (int j = 0; j < width; j += elementCount) { \
|
||||
packed = packedLoadFunc(j, in); \
|
||||
\
|
||||
xyz = typed_gather_##floatType(i32gather, packed); \
|
||||
\
|
||||
rgb = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz); \
|
||||
\
|
||||
rgb = maxf(minf(rgb, minValue), maxValue); \
|
||||
\
|
||||
rgb = fpow(rgb, rgbGamma_g); \
|
||||
\
|
||||
yuv = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb); \
|
||||
\
|
||||
packed = addi(ftoi(yuv), yuv2packed_add); \
|
||||
\
|
||||
packedStoreFunc(packed, j, luma, cb, cr); \
|
||||
} \
|
||||
};
|
||||
|
||||
|
||||
|
||||
// typed_converter_pump2: same generated globals, matrix loader and line
// converter as typed_converter_pump1, but the loop body is unrolled to handle
// two vectors per iteration (j advances by elementCount*2). The two streams
// (suffix 0 and 1) go through identical stages, interleaved stage by stage:
//   load -> table gather -> XYZ-to-RGB -> clamp to [0, 1] ->
//   fpow(rgb, rgbGamma_g) -> RGB-to-YUV -> ftoi + yuv2packed_add -> store.
// minValue holds 1.0 and maxValue holds 0.0 (the names are swapped relative to
// their contents).
// NOTE(review): no tail handling and `height` is never read; presumably width
// is a multiple of elementCount*2 or buffers are padded - confirm.
#define typed_converter_pump2(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
|
||||
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
|
||||
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
|
||||
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
|
||||
\
|
||||
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
|
||||
typed_mat_load_func_##elementCount(setf, m, in) \
|
||||
}; \
|
||||
\
|
||||
\
|
||||
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
|
||||
iPackedType packed0, packed1; \
|
||||
fVectorType xyz0, xyz1; \
|
||||
fVectorType rgb0, rgb1; \
|
||||
fVectorType yuv0, yuv1; \
|
||||
\
|
||||
const fVectorType minValue = set1f(1.); \
|
||||
const fVectorType maxValue = set1f(0.); \
|
||||
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
|
||||
\
|
||||
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
|
||||
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
|
||||
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
|
||||
\
|
||||
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
|
||||
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
|
||||
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
|
||||
\
|
||||
for (int j = 0; j < width; j += elementCount*2) { \
|
||||
packed0 = packedLoadFunc(j, in); \
|
||||
packed1 = packedLoadFunc(j+elementCount, in); \
|
||||
\
|
||||
xyz0 = typed_gather_##floatType(i32gather, packed0); \
|
||||
xyz1 = typed_gather_##floatType(i32gather, packed1); \
|
||||
\
|
||||
rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
|
||||
rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
|
||||
\
|
||||
rgb0 = maxf(minf(rgb0, minValue), maxValue); \
|
||||
rgb1 = maxf(minf(rgb1, minValue), maxValue); \
|
||||
\
|
||||
rgb0 = fpow(rgb0, rgbGamma_g); \
|
||||
rgb1 = fpow(rgb1, rgbGamma_g); \
|
||||
\
|
||||
yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
|
||||
yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
|
||||
\
|
||||
packed0 = addi(ftoi(yuv0), yuv2packed_add); \
|
||||
packed1 = addi(ftoi(yuv1), yuv2packed_add); \
|
||||
\
|
||||
packedStoreFunc(packed0, j, luma, cb, cr); \
|
||||
packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
|
||||
} \
|
||||
};
|
||||
|
||||
|
||||
// typed_converter_pump4: same generated globals, matrix loader and line
// converter as typed_converter_pump1, but the loop body is unrolled to handle
// four vectors per iteration (j advances by elementCount*4). The four streams
// (suffix 0..3) go through identical stages, interleaved stage by stage:
//   load -> table gather -> XYZ-to-RGB -> clamp to [0, 1] ->
//   fpow(rgb, rgbGamma_g) -> RGB-to-YUV -> ftoi + yuv2packed_add -> store.
// minValue holds 1.0 and maxValue holds 0.0 (the names are swapped relative to
// their contents).
// NOTE(review): no tail handling and `height` is never read; presumably width
// is a multiple of elementCount*4 or buffers are padded - confirm.
#define typed_converter_pump4(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
|
||||
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
|
||||
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
|
||||
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
|
||||
\
|
||||
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
|
||||
typed_mat_load_func_##elementCount(setf, m, in) \
|
||||
}; \
|
||||
\
|
||||
\
|
||||
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
|
||||
iPackedType packed0, packed1, packed2, packed3; \
|
||||
fVectorType xyz0, xyz1, xyz2, xyz3; \
|
||||
fVectorType rgb0, rgb1, rgb2, rgb3; \
|
||||
fVectorType yuv0, yuv1, yuv2, yuv3; \
|
||||
\
|
||||
const fVectorType minValue = set1f(1.); \
|
||||
const fVectorType maxValue = set1f(0.); \
|
||||
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
|
||||
\
|
||||
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
|
||||
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
|
||||
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
|
||||
\
|
||||
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
|
||||
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
|
||||
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
|
||||
\
|
||||
for (int j = 0; j < width; j += elementCount*4) { \
|
||||
packed0 = packedLoadFunc(j, in); \
|
||||
packed1 = packedLoadFunc(j+elementCount, in); \
|
||||
packed2 = packedLoadFunc(j+elementCount*2, in); \
|
||||
packed3 = packedLoadFunc(j+elementCount*3, in); \
|
||||
\
|
||||
xyz0 = typed_gather_##floatType(i32gather, packed0); \
|
||||
xyz1 = typed_gather_##floatType(i32gather, packed1); \
|
||||
xyz2 = typed_gather_##floatType(i32gather, packed2); \
|
||||
xyz3 = typed_gather_##floatType(i32gather, packed3); \
|
||||
\
|
||||
rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
|
||||
rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
|
||||
rgb2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz2); \
|
||||
rgb3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz3); \
|
||||
\
|
||||
rgb0 = maxf(minf(rgb0, minValue), maxValue); \
|
||||
rgb1 = maxf(minf(rgb1, minValue), maxValue); \
|
||||
rgb2 = maxf(minf(rgb2, minValue), maxValue); \
|
||||
rgb3 = maxf(minf(rgb3, minValue), maxValue); \
|
||||
\
|
||||
rgb0 = fpow(rgb0, rgbGamma_g); \
|
||||
rgb1 = fpow(rgb1, rgbGamma_g); \
|
||||
rgb2 = fpow(rgb2, rgbGamma_g); \
|
||||
rgb3 = fpow(rgb3, rgbGamma_g); \
|
||||
\
|
||||
yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
|
||||
yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
|
||||
yuv2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb2); \
|
||||
yuv3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb3); \
|
||||
\
|
||||
packed0 = addi(ftoi(yuv0), yuv2packed_add); \
|
||||
packed1 = addi(ftoi(yuv1), yuv2packed_add); \
|
||||
packed2 = addi(ftoi(yuv2), yuv2packed_add); \
|
||||
packed3 = addi(ftoi(yuv3), yuv2packed_add); \
|
||||
\
|
||||
packedStoreFunc(packed0, j, luma, cb, cr); \
|
||||
packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
|
||||
packedStoreFunc(packed2, j+elementCount*2, luma, cb, cr); \
|
||||
packedStoreFunc(packed3, j+elementCount*3, luma, cb, cr); \
|
||||
} \
|
||||
};
|
||||
|
||||
|
||||
|
||||
// typed_converter_pump8: same generated globals, matrix loader and line
// converter as typed_converter_pump1, but the loop body is unrolled to handle
// eight vectors per iteration (j advances by elementCount*8). The eight streams
// (suffix 0..7) go through identical stages, interleaved stage by stage:
//   load -> table gather -> XYZ-to-RGB -> clamp to [0, 1] ->
//   fpow(rgb, rgbGamma_g) -> RGB-to-YUV -> ftoi + yuv2packed_add -> store.
// minValue holds 1.0 and maxValue holds 0.0 (the names are swapped relative to
// their contents).
// NOTE(review): no tail handling and `height` is never read; presumably width
// is a multiple of elementCount*8 or buffers are padded - confirm.
#define typed_converter_pump8(floatType, fVectorType, iPackedType, elementCount, set1f, setf, seti, fmaddf, mulf, addf, hasFMA, permuteLanesf, i32gather, minf, maxf, addi, ftoi, fpow, packedLoadFunc, packedStoreFunc) \
|
||||
fVectorType xyz2rgb_mat_##floatType[3] __attribute__((aligned(64))); \
|
||||
fVectorType rgb2yuv_mat_##floatType[3] __attribute__((aligned(64))); \
|
||||
floatType xyz12_to_linear_##floatType[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64))); \
|
||||
\
|
||||
void concat(_load_matrix_func, floatType) (fVectorType* m, const double* restrict in) { \
|
||||
typed_mat_load_func_##elementCount(setf, m, in) \
|
||||
}; \
|
||||
\
|
||||
\
|
||||
inline __attribute__((always_inline)) void concat(_line_func, floatType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) { \
|
||||
iPackedType packed0, packed1, packed2, packed3, packed4, packed5, packed6, packed7; \
|
||||
fVectorType xyz0, xyz1, xyz2, xyz3, xyz4, xyz5, xyz6, xyz7; \
|
||||
fVectorType rgb0, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7; \
|
||||
fVectorType yuv0, yuv1, yuv2, yuv3, yuv4, yuv5, yuv6, yuv7; \
|
||||
\
|
||||
const fVectorType minValue = set1f(1.); \
|
||||
const fVectorType maxValue = set1f(0.); \
|
||||
const iPackedType yuv2packed_add = typed_yuv_add_##elementCount(seti); \
|
||||
\
|
||||
const fVectorType x0 = xyz2rgb_mat_##floatType[0]; \
|
||||
const fVectorType x1 = xyz2rgb_mat_##floatType[1]; \
|
||||
const fVectorType x2 = xyz2rgb_mat_##floatType[2]; \
|
||||
\
|
||||
const fVectorType y0 = rgb2yuv_mat_##floatType[0]; \
|
||||
const fVectorType y1 = rgb2yuv_mat_##floatType[1]; \
|
||||
const fVectorType y2 = rgb2yuv_mat_##floatType[2]; \
|
||||
\
|
||||
for (int j = 0; j < width; j += elementCount*8) { \
|
||||
packed0 = packedLoadFunc(j, in); \
|
||||
packed1 = packedLoadFunc(j+elementCount, in); \
|
||||
packed2 = packedLoadFunc(j+elementCount*2, in); \
|
||||
packed3 = packedLoadFunc(j+elementCount*3, in); \
|
||||
packed4 = packedLoadFunc(j+elementCount*4, in); \
|
||||
packed5 = packedLoadFunc(j+elementCount*5, in); \
|
||||
packed6 = packedLoadFunc(j+elementCount*6, in); \
|
||||
packed7 = packedLoadFunc(j+elementCount*7, in); \
|
||||
\
|
||||
xyz0 = typed_gather_##floatType(i32gather, packed0); \
|
||||
xyz1 = typed_gather_##floatType(i32gather, packed1); \
|
||||
xyz2 = typed_gather_##floatType(i32gather, packed2); \
|
||||
xyz3 = typed_gather_##floatType(i32gather, packed3); \
|
||||
xyz4 = typed_gather_##floatType(i32gather, packed4); \
|
||||
xyz5 = typed_gather_##floatType(i32gather, packed5); \
|
||||
xyz6 = typed_gather_##floatType(i32gather, packed6); \
|
||||
xyz7 = typed_gather_##floatType(i32gather, packed7); \
|
||||
\
|
||||
rgb0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz0); \
|
||||
rgb1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz1); \
|
||||
rgb2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz2); \
|
||||
rgb3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz3); \
|
||||
rgb4 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz4); \
|
||||
rgb5 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz5); \
|
||||
rgb6 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz6); \
|
||||
rgb7 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, x0, x1, x2, xyz7); \
|
||||
\
|
||||
rgb0 = maxf(minf(rgb0, minValue), maxValue); \
|
||||
rgb1 = maxf(minf(rgb1, minValue), maxValue); \
|
||||
rgb2 = maxf(minf(rgb2, minValue), maxValue); \
|
||||
rgb3 = maxf(minf(rgb3, minValue), maxValue); \
|
||||
rgb4 = maxf(minf(rgb4, minValue), maxValue); \
|
||||
rgb5 = maxf(minf(rgb5, minValue), maxValue); \
|
||||
rgb6 = maxf(minf(rgb6, minValue), maxValue); \
|
||||
rgb7 = maxf(minf(rgb7, minValue), maxValue); \
|
||||
\
|
||||
rgb0 = fpow(rgb0, rgbGamma_g); \
|
||||
rgb1 = fpow(rgb1, rgbGamma_g); \
|
||||
rgb2 = fpow(rgb2, rgbGamma_g); \
|
||||
rgb3 = fpow(rgb3, rgbGamma_g); \
|
||||
rgb4 = fpow(rgb4, rgbGamma_g); \
|
||||
rgb5 = fpow(rgb5, rgbGamma_g); \
|
||||
rgb6 = fpow(rgb6, rgbGamma_g); \
|
||||
rgb7 = fpow(rgb7, rgbGamma_g); \
|
||||
\
|
||||
yuv0 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb0); \
|
||||
yuv1 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb1); \
|
||||
yuv2 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb2); \
|
||||
yuv3 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb3); \
|
||||
yuv4 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb4); \
|
||||
yuv5 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb5); \
|
||||
yuv6 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb6); \
|
||||
yuv7 = typed_mul_vec_dot_##elementCount(hasFMA, fVectorType, seti, fmaddf, mulf, addf, permuteLanesf, y0, y1, y2, rgb7); \
|
||||
\
|
||||
packed0 = addi(ftoi(yuv0), yuv2packed_add); \
|
||||
packed1 = addi(ftoi(yuv1), yuv2packed_add); \
|
||||
packed2 = addi(ftoi(yuv2), yuv2packed_add); \
|
||||
packed3 = addi(ftoi(yuv3), yuv2packed_add); \
|
||||
packed4 = addi(ftoi(yuv4), yuv2packed_add); \
|
||||
packed5 = addi(ftoi(yuv5), yuv2packed_add); \
|
||||
packed6 = addi(ftoi(yuv6), yuv2packed_add); \
|
||||
packed7 = addi(ftoi(yuv7), yuv2packed_add); \
|
||||
\
|
||||
packedStoreFunc(packed0, j, luma, cb, cr); \
|
||||
packedStoreFunc(packed1, j+elementCount, luma, cb, cr); \
|
||||
packedStoreFunc(packed2, j+elementCount*2, luma, cb, cr); \
|
||||
packedStoreFunc(packed3, j+elementCount*3, luma, cb, cr); \
|
||||
packedStoreFunc(packed4, j+elementCount*4, luma, cb, cr); \
|
||||
packedStoreFunc(packed5, j+elementCount*5, luma, cb, cr); \
|
||||
packedStoreFunc(packed6, j+elementCount*6, luma, cb, cr); \
|
||||
packedStoreFunc(packed7, j+elementCount*7, luma, cb, cr); \
|
||||
} \
|
||||
};
|
||||
|
||||
// Platform dispatch: resolve an operation name (set1, fmadd, ...) to the
// platform-specific builder typed_func_<platform>_<func>.
#define typed_func(platform, floatType, elementCount, func) typed_func_##platform##_##func(floatType, elementCount)
|
||||
|
||||
// Platform dispatch for the floating-point and integer vector type names.
#define typed_vector_type(platform, floatType, elementCount) typed_vector_type_##platform(floatType, elementCount)
|
||||
#define typed_vector_int_type(platform, floatType, elementCount) typed_vector_int_type_##platform(floatType, elementCount)
|
||||
|
||||
// Bit width of the integer vector paired with a configuration; resolves to the
// table entry typed_vector_int_size_<platform>_<floatType>_<elementCount>.
// (This macro was previously defined twice with identical text.)
#define typed_vector_int_size(platform, floatType, elementCount) typed_vector_int_size_##platform##_##floatType##_##elementCount

// Integer intrinsic prefix for a configuration; resolves to the table entry
// typed_vector_int_func_<platform>_<floatType>_<elementCount>. The previous
// expansion pasted onto "typed_vector_int_funcsize_", a name that matches none
// of the per-platform tables, so any use would have failed to resolve.
#define typed_vector_int_func(platform, floatType, elementCount) typed_vector_int_func_##platform##_##floatType##_##elementCount
|
||||
|
||||
// Selects the unrolled generator typed_converter_pump<pumps>.
#define typed_converter_pumps(pumps) concat(typed_converter_pump, pumps)
|
||||
|
||||
// Instantiates a converter for one platform/configuration: resolves every
// vector type and intrinsic name through typed_vector_type, typed_vector_int_type
// and typed_func, then forwards them, in the parameter order of the
// typed_converter_pump* macros, together with hasFMA and the packed load/store
// functions.
#define typed_converter_platform(platform, pumps, floatType, elementCount, hasFMA, packedLoadFunc, packedStoreFunc) \
|
||||
typed_converter_pumps(pumps)(floatType, \
|
||||
typed_vector_type(platform, floatType, elementCount), \
|
||||
typed_vector_int_type(platform, floatType, elementCount), \
|
||||
elementCount, \
|
||||
typed_func(platform, floatType, elementCount, set1), \
|
||||
typed_func(platform, floatType, elementCount, set), \
|
||||
typed_func(platform, floatType, elementCount, seti), \
|
||||
typed_func(platform, floatType, elementCount, fmadd), \
|
||||
typed_func(platform, floatType, elementCount, mul), \
|
||||
typed_func(platform, floatType, elementCount, add), \
|
||||
hasFMA, \
|
||||
typed_func(platform, floatType, elementCount, permute_lanes), \
|
||||
typed_func(platform, floatType, elementCount, i32gather), \
|
||||
typed_func(platform, floatType, elementCount, min), \
|
||||
typed_func(platform, floatType, elementCount, max), \
|
||||
typed_func(platform, floatType, elementCount, addi), \
|
||||
typed_func(platform, floatType, elementCount, ftoi), \
|
||||
typed_func(platform, floatType, elementCount, pow1), \
|
||||
packedLoadFunc, \
|
||||
packedStoreFunc \
|
||||
)
|
||||
|
||||
|
58
conv/conv_generic.h
Normal file
58
conv/conv_generic.h
Normal file
|
@ -0,0 +1,58 @@
|
|||
|
||||
#define xyz12_to_linear_name xyz12_to_linear_
|
||||
#define xyz12_to_linear_type(fType) concat(xyz12_to_linear_name, fType)
|
||||
|
||||
fType xyz12_to_linear_type(fType)[1 << XYZ_LOOKUP_TABLE_SIZE] __attribute__((aligned(64)));
|
||||
|
||||
#define clamp(v, min, max) (v < min ? min : (v > max ? max : v))
|
||||
|
||||
#define M(m, i, j) (m[i*3+j])
|
||||
|
||||
#define mxv_step_name mxv_step_
|
||||
#define _line_name convert_line_dci_xyz12_to_yuv16_
|
||||
|
||||
/*
 * Computes one component of a 3x3 matrix * vector product: the dot product of
 * row `step` of the row-major `matrix` with the vector (a, b, c).
 */
inline __attribute__((always_inline)) fType concat(mxv_step_name, fType) (const double* matrix, int step, fType a, fType b, fType c) {
    const double* row = matrix + step*3;

    return row[0]*a + row[1]*b + row[2]*c;
}
|
||||
|
||||
inline __attribute__((always_inline)) void concat(_line_name, fType) (const uint32_t* restrict in, uint16_t* restrict luma, uint16_t* restrict cb, uint16_t* restrict cr, int width, int height) {
|
||||
fType xyz[3] __attribute__((aligned(32)));
|
||||
fType rgb[3] __attribute__((aligned(32)));
|
||||
fType yuv[3] __attribute__((aligned(32)));
|
||||
|
||||
const fType rgbGamma_l = rgbGamma_g;
|
||||
|
||||
for (int j = 0; j < width; j++) {
|
||||
xyz[0] = xyz12_to_linear_type(fType)[in[j*3] << (XYZ_LOOKUP_TABLE_SIZE - 12)];
|
||||
xyz[1] = xyz12_to_linear_type(fType)[in[j*3+1] << (XYZ_LOOKUP_TABLE_SIZE - 12)];
|
||||
xyz[2] = xyz12_to_linear_type(fType)[in[j*3+2] << (XYZ_LOOKUP_TABLE_SIZE - 12)];
|
||||
|
||||
rgb[0] = concat(mxv_step_name, fType)(xyz2rgb_g, 0, xyz[0], xyz[1], xyz[2]);
|
||||
rgb[1] = concat(mxv_step_name, fType)(xyz2rgb_g, 1, xyz[0], xyz[1], xyz[2]);
|
||||
rgb[2] = concat(mxv_step_name, fType)(xyz2rgb_g, 2, xyz[0], xyz[1], xyz[2]);
|
||||
|
||||
rgb[0] = clamp(rgb[0], 0., 1.);
|
||||
rgb[1] = clamp(rgb[1], 0., 1.);
|
||||
rgb[2] = clamp(rgb[2], 0., 1.);
|
||||
|
||||
// todo: optimize this via https://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent ?
|
||||
|
||||
if (sizeof(fType) == 4){
|
||||
rgb[0] = powf(rgb[0], rgbGamma_l);
|
||||
rgb[1] = powf(rgb[1], rgbGamma_l);
|
||||
rgb[2] = powf(rgb[2], rgbGamma_l);
|
||||
} else if (sizeof(fType) == 8){
|
||||
rgb[0] = pow(rgb[0], rgbGamma_l);
|
||||
rgb[1] = pow(rgb[1], rgbGamma_l);
|
||||
rgb[2] = pow(rgb[2], rgbGamma_l);
|
||||
}
|
||||
|
||||
yuv[0] = concat(mxv_step_name, fType) (rgb2yuv_g, 0, rgb[0], rgb[1], rgb[2]);
|
||||
yuv[1] = concat(mxv_step_name, fType) (rgb2yuv_g, 1, rgb[0], rgb[1], rgb[2]);
|
||||
yuv[2] = concat(mxv_step_name, fType) (rgb2yuv_g, 2, rgb[0], rgb[1], rgb[2]);
|
||||
|
||||
luma[j] = clamp((int)(round(yuv[0])), 0, uint16Max);
|
||||
cb[j] = clamp((int)(round(yuv[1])) + int16MaxPlusOne, 0, uint16Max);
|
||||
cr[j] = clamp((int)(round(yuv[2])) + int16MaxPlusOne, 0, uint16Max);
|
||||
}
|
||||
}
|
1
conv/simd_utils
Submodule
1
conv/simd_utils
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit e0aa01336b63d0c9d351a09dc24e0b22483219ad
|
80
convert.go
Normal file
80
convert.go
Normal file
|
@ -0,0 +1,80 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/colorspace"
|
||||
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/conv"
|
||||
"gonum.org/v1/gonum/mat"
|
||||
)
|
||||
|
||||
var useCConverter, useFloat bool
|
||||
|
||||
func ConvertFrame(in []uint32, y, cb, cr []uint16, width, height int) {
|
||||
if useCConverter {
|
||||
if useFloat {
|
||||
conv.ConvertFrameDCIXYZToYUV16Float(in, y, cb, cr, width, height)
|
||||
} else {
|
||||
conv.ConvertFrameDCIXYZToYUV16Double(in, y, cb, cr, width, height)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
for i := 0; i < height; i++ {
|
||||
ConvertLine(in, y, cb, cr, width, height)
|
||||
|
||||
in = in[width*3:]
|
||||
|
||||
y = y[width:]
|
||||
cb = cb[width:]
|
||||
cr = cr[width:]
|
||||
}
|
||||
}
|
||||
|
||||
func ConvertLine(in []uint32, y, cb, cr []uint16, width, height int) {
|
||||
if useCConverter {
|
||||
if useFloat {
|
||||
conv.ConvertLineDCIXYZToYUV16Float(in, y, cb, cr, width, height)
|
||||
} else {
|
||||
conv.ConvertLineDCIXYZToYUV16Double(in, y, cb, cr, width, height)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
var r, g, b float64
|
||||
xyz, rgb, yuv := mat.NewVecDense(3, nil), mat.NewVecDense(3, nil), mat.NewVecDense(3, nil)
|
||||
|
||||
for j := 0; j < width; j++ {
|
||||
// LUT
|
||||
xyz.SetVec(0, colorspace.DCIXYZSystem.ToLinearLUT[uint16(in[j*3])<<4])
|
||||
xyz.SetVec(1, colorspace.DCIXYZSystem.ToLinearLUT[uint16(in[j*3+1])<<4])
|
||||
xyz.SetVec(2, colorspace.DCIXYZSystem.ToLinearLUT[uint16(in[j*3+2])<<4])
|
||||
|
||||
//TODO: apply white point correction here if necessary (but after denorm)
|
||||
|
||||
//denormalize + xyz2rgb
|
||||
rgb.MulVec(xyz2rgbDenorm, xyz)
|
||||
|
||||
//todo: some out of bounds r,g,b come up from here, maybe just fine
|
||||
//clamp values into proper values. necessary due to XYZ ranges, todo: check conversion matrix for preserving this
|
||||
r = min(1.0, max(0, rgb.AtVec(0)))
|
||||
g = min(1.0, max(0, rgb.AtVec(1)))
|
||||
b = min(1.0, max(0, rgb.AtVec(2)))
|
||||
|
||||
// companding / adjustment with gamma curve
|
||||
//TODO: why is it not using normal Rec709 and instead using straight gamma curve
|
||||
r = space.FromLinear(r)
|
||||
g = space.FromLinear(g)
|
||||
b = space.FromLinear(b)
|
||||
|
||||
rgb.SetVec(0, r)
|
||||
rgb.SetVec(1, g)
|
||||
rgb.SetVec(2, b)
|
||||
|
||||
yuv.MulVec(rgb2yuv, rgb)
|
||||
|
||||
// map RGB to components
|
||||
// scale float range to 16bit precision, in full swing
|
||||
y[j] = LumaToFull16(yuv.AtVec(0))
|
||||
cb[j] = ChromaToFull16(yuv.AtVec(1))
|
||||
cr[j] = ChromaToFull16(yuv.AtVec(2))
|
||||
}
|
||||
}
|
5
go.mod
Normal file
5
go.mod
Normal file
|
@ -0,0 +1,5 @@
|
|||
module git.gammaspectra.live/WeebDataHoarder/xyz2yuv
|
||||
|
||||
go 1.21
|
||||
|
||||
require gonum.org/v1/gonum v0.14.0
|
4
go.sum
Normal file
4
go.sum
Normal file
|
@ -0,0 +1,4 @@
|
|||
golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug=
|
||||
golang.org/x/exp v0.0.0-20230321023759-10a507213a29/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
|
||||
gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
|
||||
gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
|
27
job.go
Normal file
27
job.go
Normal file
|
@ -0,0 +1,27 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
// frameJobData is one unit of conversion work: an input frame plus the output
// planes it is converted into.
type frameJobData struct {
	// wg is signalled by Process once the conversion has finished.
	wg *sync.WaitGroup

	// frame is the frame number; height and width are the frame dimensions
	// passed to ConvertFrame.
	frame, height, width int

	// in holds the packed input samples.
	in []uint32

	// y, cb and cr are the output planes.
	y, cb, cr []uint16
}
|
||||
|
||||
func (job frameJobData) Process() {
|
||||
defer job.wg.Done()
|
||||
|
||||
ConvertFrame(job.in, job.y, job.cb, job.cr, job.width, job.height)
|
||||
}
|
||||
|
||||
// decodedFrame is a decoded input frame waiting to be scheduled for conversion.
type decodedFrame struct {
	// Number is the frame number, used to restore the original frame order.
	Number int

	// Frame holds the packed samples of the frame.
	Frame []uint32
}
|
102
libav/libav.go
Normal file
102
libav/libav.go
Normal file
|
@ -0,0 +1,102 @@
|
|||
package libav
|
||||
|
||||
/*
|
||||
#cgo pkg-config: libavformat libavcodec libavutil
|
||||
|
||||
#include <libavformat/avformat.h>
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"slices"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// PacketData is one video packet read from the input container.
type PacketData struct {
	// Number is the zero-based index of the packet within the video stream.
	Number int

	// Data is a Go-owned copy of the packet payload.
	Data []byte
}
|
||||
|
||||
func OpenXYZ12(inputFile string, initFunc func(framerateNum, framerateDen, sarNum, sarDen, width, height int) error, packetFunc func(packet PacketData) error) error {
|
||||
var fmtCtx *C.AVFormatContext
|
||||
|
||||
//Open file and decoder
|
||||
|
||||
inputFileC := C.CString(inputFile)
|
||||
defer C.free(unsafe.Pointer(inputFileC))
|
||||
|
||||
ret := C.avformat_open_input(&fmtCtx, inputFileC, nil, nil)
|
||||
if ret < 0 {
|
||||
panic("cannot open input file")
|
||||
}
|
||||
defer C.avformat_close_input(&fmtCtx)
|
||||
|
||||
ret = C.avformat_find_stream_info(fmtCtx, nil)
|
||||
if ret < 0 {
|
||||
panic("cannot find stream information")
|
||||
}
|
||||
|
||||
//get video
|
||||
ret = C.av_find_best_stream(fmtCtx, C.AVMEDIA_TYPE_VIDEO, -1, -1, nil, 0)
|
||||
if ret < 0 {
|
||||
panic("cannot find video stream")
|
||||
}
|
||||
|
||||
videoStreamIndex := ret
|
||||
|
||||
inputStream := unsafe.Slice(fmtCtx.streams, fmtCtx.nb_streams)[videoStreamIndex]
|
||||
|
||||
codecPar := inputStream.codecpar
|
||||
|
||||
if codecPar.codec_id != C.AV_CODEC_ID_JPEG2000 {
|
||||
panic("video codec not JPEG2000")
|
||||
}
|
||||
|
||||
if codecPar.format != C.AV_PIX_FMT_XYZ12LE {
|
||||
panic("video format not xyz12le")
|
||||
}
|
||||
|
||||
codecPar.color_range = C.AVCOL_RANGE_JPEG
|
||||
codecPar.color_primaries = C.AVCOL_PRI_SMPTE431
|
||||
//codecPar.color_trc = C.AVCOL_PRI_SMPTE431
|
||||
//codecPar.color_space = C.AVCOL_PRI_SMPTE431
|
||||
|
||||
var packet *C.AVPacket
|
||||
packet = C.av_packet_alloc()
|
||||
if packet == nil {
|
||||
panic("err allocating")
|
||||
}
|
||||
defer C.av_packet_free(&packet)
|
||||
|
||||
err := initFunc(int(inputStream.codec.framerate.num), int(inputStream.codec.framerate.den), int(inputStream.sample_aspect_ratio.num), int(inputStream.sample_aspect_ratio.den), int(codecPar.width), int(codecPar.height))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var frameNumber int
|
||||
|
||||
for {
|
||||
ret = C.av_read_frame(fmtCtx, packet)
|
||||
if ret < 0 {
|
||||
break
|
||||
}
|
||||
if packet.stream_index == videoStreamIndex {
|
||||
err = packetFunc(PacketData{
|
||||
Number: frameNumber,
|
||||
Data: slices.Clone(unsafe.Slice((*byte)(packet.data), int(packet.size))),
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
frameNumber++
|
||||
}
|
||||
C.av_packet_unref(packet)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
141
libopenjp2/libopenjp2.go
Normal file
141
libopenjp2/libopenjp2.go
Normal file
|
@ -0,0 +1,141 @@
|
|||
package libopenjp2
|
||||
|
||||
/*
|
||||
#cgo pkg-config: libopenjp2
|
||||
|
||||
#include "libopenjp2.h"
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"slices"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
//FFMpeg removed libopenjpeg decoder https://github.com/FFmpeg/FFmpeg/commit/60ccb3fe787be3bb10fc4545b3593cd1e0b769ed
|
||||
|
||||
const Jp2SigType = 0x6A502020
|
||||
const Jp2SigValue = 0x0D0A870A
|
||||
|
||||
type Jpeg2000Decoder struct {
|
||||
ctx C.opj_dparameters_t
|
||||
}
|
||||
|
||||
const QualityLayersAll = 0
|
||||
const ResolutionLayersAll = 0
|
||||
|
||||
func NewJpeg2000Decoder(qualityLayers, resolutionLayers uint) (*Jpeg2000Decoder, error) {
|
||||
d := &Jpeg2000Decoder{}
|
||||
C.opj_set_default_decoder_parameters(&d.ctx)
|
||||
|
||||
d.ctx.cp_layer = C.uint32_t(qualityLayers)
|
||||
d.ctx.cp_reduce = C.uint32_t(resolutionLayers)
|
||||
|
||||
return d, nil
|
||||
}
|
||||
|
||||
var jp2c = binary.LittleEndian.Uint32([]byte("jp2c"))
|
||||
|
||||
// Jpeg2000Frame is a decoded frame with one sample plane per component.
type Jpeg2000Frame struct {
	// Width and Height are the dimensions of the first component.
	Width  int
	Height int

	// X, Y and Z hold the samples of components 0, 1 and 2 respectively.
	X []uint32
	Y []uint32
	Z []uint32
}
|
||||
|
||||
func (d *Jpeg2000Decoder) DecodeFrame(buf []byte) (frame *Jpeg2000Frame, err error) {
|
||||
var image *C.opj_image_t
|
||||
var dec *C.opj_codec_t
|
||||
var stream *C.opj_stream_t
|
||||
|
||||
var pinner runtime.Pinner
|
||||
pinner.Pin(unsafe.Pointer(unsafe.SliceData(buf)))
|
||||
defer pinner.Unpin()
|
||||
|
||||
// Check if input is a raw jpeg2k codestream or in jp2 wrapping
|
||||
if (binary.LittleEndian.Uint32(buf) == 12) &&
|
||||
(binary.LittleEndian.Uint32(buf[4:]) == Jp2SigType) &&
|
||||
(binary.LittleEndian.Uint32(buf[8:]) == Jp2SigValue) {
|
||||
dec = C.opj_create_decompress(C.OPJ_CODEC_JP2)
|
||||
} else {
|
||||
/* If the AVPacket contains a jp2c box, then skip to
|
||||
* the starting byte of the codestream. */
|
||||
if binary.LittleEndian.Uint32(buf[4:]) == jp2c {
|
||||
buf = buf[8:]
|
||||
}
|
||||
dec = C.opj_create_decompress(C.OPJ_CODEC_J2K)
|
||||
}
|
||||
|
||||
if dec == nil {
|
||||
return nil, errors.New("error initializing decoder")
|
||||
}
|
||||
defer C.opj_destroy_codec(dec)
|
||||
|
||||
// Tie decoder with decoding parameters
|
||||
C.opj_setup_decoder(dec, &d.ctx)
|
||||
|
||||
stream = C.opj_stream_default_create(C.OPJ_STREAM_READ)
|
||||
defer C.opj_stream_destroy(stream)
|
||||
|
||||
if stream == nil {
|
||||
return nil, errors.New("error initializing stream")
|
||||
}
|
||||
|
||||
reader := &C.BufferReader{
|
||||
pos: 0,
|
||||
size: C.int(len(buf)),
|
||||
buffer: (*C.uchar)(unsafe.Pointer(unsafe.SliceData(buf))),
|
||||
}
|
||||
|
||||
pinner.Pin(reader)
|
||||
|
||||
C.set_stream_callbacks(stream)
|
||||
C.opj_stream_set_user_data(stream, unsafe.Pointer(reader), nil)
|
||||
C.opj_stream_set_user_data_length(stream, C.ulong(len(buf)))
|
||||
|
||||
ret := C.opj_read_header(stream, dec, &image)
|
||||
defer C.opj_image_destroy(image)
|
||||
|
||||
if ret != 1 {
|
||||
return nil, errors.New("error decoding stream header")
|
||||
}
|
||||
|
||||
if image.numcomps != 3 {
|
||||
return nil, fmt.Errorf("unexpected component number %d", image.numcomps)
|
||||
}
|
||||
|
||||
components := unsafe.Slice(image.comps, int(image.numcomps))
|
||||
|
||||
for i, c := range components {
|
||||
if c.prec != 12 {
|
||||
return nil, fmt.Errorf("unexpected component %d bit depth %d", i, c.prec)
|
||||
}
|
||||
}
|
||||
|
||||
ret = C.opj_decode(dec, stream, image)
|
||||
|
||||
if ret != 1 {
|
||||
return nil, errors.New("error decoding image")
|
||||
}
|
||||
|
||||
for i, c := range components {
|
||||
if c.data == nil {
|
||||
return nil, fmt.Errorf("component %d has no data", i)
|
||||
}
|
||||
}
|
||||
|
||||
readComponent := func(index int) []uint32 {
|
||||
return unsafe.Slice((*uint32)(unsafe.Pointer(components[index].data)), int(components[index].h)*int(components[index].w))
|
||||
}
|
||||
|
||||
frame = &Jpeg2000Frame{
|
||||
Width: int(components[0].w),
|
||||
Height: int(components[0].h),
|
||||
X: slices.Clone(readComponent(0)),
|
||||
Y: slices.Clone(readComponent(1)),
|
||||
Z: slices.Clone(readComponent(2)),
|
||||
}
|
||||
|
||||
return frame, nil
|
||||
}
|
67
libopenjp2/libopenjp2.h
Normal file
67
libopenjp2/libopenjp2.h
Normal file
|
@ -0,0 +1,67 @@
|
|||
#include <string.h>
|
||||
|
||||
#include <openjpeg.h>
|
||||
|
||||
/* Read cursor over an in-memory buffer, used as user data for the stream callbacks. */
typedef struct BufferReader {
    int pos;               /* current read offset into buffer, in bytes */
    int size;              /* total number of bytes available in buffer */
    const uint8_t *buffer; /* start of the data; not owned by the reader */
} BufferReader;
|
||||
|
||||
/*
 * Read callback: copies up to nb_bytes from the reader's current position into
 * out_buffer and advances the position.
 * Returns the number of bytes copied, or (OPJ_SIZE_T)-1 when the position is
 * already at the end of the buffer.
 */
static OPJ_SIZE_T stream_read(void *out_buffer, OPJ_SIZE_T nb_bytes, void *user_data) {
    BufferReader *reader = user_data;
    const int available = reader->size - reader->pos;

    if (available == 0) {
        return (OPJ_SIZE_T)-1;
    }

    /* Never copy past the end of the buffer. */
    if (nb_bytes > available) {
        nb_bytes = available;
    }

    memcpy(out_buffer, reader->buffer + reader->pos, nb_bytes);
    reader->pos += (int)nb_bytes;
    return nb_bytes;
}
|
||||
|
||||
/*
 * Skip callback: moves the reader's position by nb_bytes (which may be
 * negative), limited to the range [0, size].
 * Returns the number of bytes actually skipped, or (OPJ_OFF_T)-1 when no
 * movement is possible in the requested direction (already at the start for a
 * backward skip, already at the end for a forward skip).
 *
 * The error value is cast to the function's own return type. Casting through
 * OPJ_SIZE_T first yields a large positive number instead of -1 on targets
 * where OPJ_SIZE_T is narrower than OPJ_OFF_T.
 */
static OPJ_OFF_T stream_skip(OPJ_OFF_T nb_bytes, void *user_data)
{
    BufferReader *reader = user_data;
    if (nb_bytes < 0) {
        if (reader->pos == 0) {
            return (OPJ_OFF_T)-1;
        }
        /* Do not move before the start of the buffer. */
        if (nb_bytes + reader->pos < 0) {
            nb_bytes = -reader->pos;
        }
    } else {
        int remaining;

        if (reader->pos == reader->size) {
            return (OPJ_OFF_T)-1;
        }
        /* Do not move past the end of the buffer. */
        remaining = reader->size - reader->pos;
        if (nb_bytes > remaining) {
            nb_bytes = remaining;
        }
    }
    reader->pos += (int)nb_bytes;
    return nb_bytes;
}
|
||||
|
||||
/*
 * Seek callback: sets the reader's position to the absolute offset nb_bytes.
 * Returns OPJ_TRUE on success, or OPJ_FALSE (leaving the position unchanged)
 * when the offset lies outside [0, size].
 */
static OPJ_BOOL stream_seek(OPJ_OFF_T nb_bytes, void *user_data)
{
    BufferReader *reader = user_data;

    if (nb_bytes >= 0 && nb_bytes <= reader->size) {
        reader->pos = (int)nb_bytes;
        return OPJ_TRUE;
    }
    return OPJ_FALSE;
}
|
||||
|
||||
|
||||
/*
 * Installs the in-memory read, skip and seek callbacks on stream.
 * Kept on the C side so the static callbacks above can be referenced by name.
 */
static void set_stream_callbacks(opj_stream_t* stream) {
    opj_stream_set_read_function(stream, stream_read);
    opj_stream_set_skip_function(stream, stream_skip);
    opj_stream_set_seek_function(stream, stream_seek);
}
|
29
parameters.go
Normal file
29
parameters.go
Normal file
|
@ -0,0 +1,29 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"gonum.org/v1/gonum/mat"
|
||||
"math"
|
||||
)
|
||||
|
||||
// LumaToFull16 scales l by math.MaxUint16, rounds to the nearest integer and
// clamps the result to the uint16 range.
func LumaToFull16(l float64) uint16 {
	scaled := int(math.Round(l * math.MaxUint16))
	if scaled < 0 {
		scaled = 0
	}
	if scaled > math.MaxUint16 {
		scaled = math.MaxUint16
	}
	return uint16(scaled)
}
|
||||
|
||||
// ChromaToFull16 scales c by math.MaxUint16, shifts it up by math.MaxInt16+1 so
// that zero lands on the middle of the range, rounds to the nearest integer and
// clamps the result to the uint16 range.
func ChromaToFull16(c float64) uint16 {
	shifted := int(math.Round(c*math.MaxUint16 + math.MaxInt16 + 1))
	if shifted < 0 {
		shifted = 0
	}
	if shifted > math.MaxUint16 {
		shifted = math.MaxUint16
	}
	return uint16(shifted)
}
|
||||
|
||||
// RoundMatToPrecision Applies rounding to each Matrix entry to limit precision
|
||||
func RoundMatToPrecision(m *mat.Dense, decimals int) *mat.Dense {
|
||||
var o mat.Dense
|
||||
if decimals <= 0 {
|
||||
o.CloneFrom(m)
|
||||
return &o
|
||||
}
|
||||
factor := 10. * float64(decimals)
|
||||
|
||||
o.Apply(func(i, j int, v float64) float64 {
|
||||
return math.Round(v*factor) / factor
|
||||
}, m)
|
||||
return &o
|
||||
}
|
5
scripts/README.md
Normal file
5
scripts/README.md
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Script collection for playback
|
||||
|
||||
```shell
|
||||
$ [script].sh "input.mkv"
|
||||
```
|
14
scripts/playback-mpv.sh
Executable file
14
scripts/playback-mpv.sh
Executable file
|
@ -0,0 +1,14 @@
|
|||
#!/bin/bash

# Plays back an XYZ input file directly through mpv.
#
# Usage: playback-mpv.sh <input file> [colorspace]
#
# The colorspace is passed to xyz2yuv's -colorspace flag and falls back to
# rec709_pure (the tool's own default) when it is omitted or empty.

if [ "$#" -lt 1 ]; then
    echo "usage: ${0} <input file> [colorspace]" >&2
    exit 1
fi

# Run from the script's directory so ../bin/xyz2yuv resolves; stop if that fails
# rather than running the pipeline from an unexpected location.
cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

../bin/xyz2yuv \
    -in "${1}" \
    -colorspace "${2:-rec709_pure}" \
    -out - | mpv \
    --demuxer-max-bytes=4096MiB --cache=yes --cache-secs=30 \
    --force-seekable=yes - \
    --external-file="${1}" \
    --vid=1 --aid=1
|
7
scripts/playback-rec709-mpv.sh
Executable file
7
scripts/playback-rec709-mpv.sh
Executable file
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash

# Plays back an XYZ input file directly through mpv, converted with the
# rec709_pure colorspace setting.

# Run from the script's directory so ./playback-mpv.sh resolves; stop if that fails.
cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

./playback-mpv.sh "${1}" rec709_pure
|
7
scripts/playback-rec709_22-mpv.sh
Executable file
7
scripts/playback-rec709_22-mpv.sh
Executable file
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash

# Plays back an XYZ input file directly through mpv, converted with the
# rec709_pure22 colorspace setting.

# Run from the script's directory so ./playback-mpv.sh resolves; stop if that fails.
cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

./playback-mpv.sh "${1}" rec709_pure22
|
7
scripts/playback-rec709_24-mpv.sh
Executable file
7
scripts/playback-rec709_24-mpv.sh
Executable file
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash

# Plays back an XYZ input file directly through mpv, converted with the
# rec709_pure24 colorspace setting.

# Run from the script's directory so ./playback-mpv.sh resolves; stop if that fails.
cd "$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" || exit 1

./playback-mpv.sh "${1}" rec709_pure24
|
427
xyz2yuv.go
Normal file
427
xyz2yuv.go
Normal file
|
@ -0,0 +1,427 @@
|
|||
package main
|
||||
|
||||
import "C"
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"flag"
|
||||
"fmt"
|
||||
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/colorspace"
|
||||
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/conv"
|
||||
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/libav"
|
||||
"git.gammaspectra.live/WeebDataHoarder/xyz2yuv/libopenjp2"
|
||||
"gonum.org/v1/gonum/mat"
|
||||
"io"
|
||||
"math"
|
||||
"os"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
var space colorspace.RelativeSystem
|
||||
|
||||
var xyz2rgb *mat.Dense
|
||||
var xyz2rgbDenorm *mat.Dense
|
||||
|
||||
var rgb2yuv *mat.Dense
|
||||
var rgb2yuvPremultiplied *mat.Dense
|
||||
|
||||
var rgbGamma float64
|
||||
|
||||
// ToPacked interleaves three equally long planes into a single slice laid out
// as a[0], b[0], c[0], a[1], b[1], c[1], ...
//
// The result has length 3*len(a) and extra additional elements of spare
// capacity. It panics with "lengths mismatch" if the planes differ in length.
func ToPacked(a, b, c []uint32, extra int) []uint32 {
	n := len(a)
	if len(b) != n || len(c) != n {
		panic("lengths mismatch")
	}

	packed := make([]uint32, n*3, n*3+extra)
	for i, pos := 0, 0; i < n; i, pos = i+1, pos+3 {
		packed[pos], packed[pos+1], packed[pos+2] = a[i], b[i], c[i]
	}
	return packed
}
|
||||
|
||||
func main() {
|
||||
inFile := flag.String("in", "", "Input file")
|
||||
startFrame := flag.Uint64("start", 0, "Start frame number inclusive")
|
||||
endFrame := flag.Uint64("end", math.MaxUint64, "End frame number exclusive")
|
||||
outFile := flag.String("out", "-", "Output file. Use - for stdout")
|
||||
colorspaceRelativeSystem := flag.String("colorspace", "rec709_pure", "Colorspace and parameters to convert into. Supported: rec709, rec709_pure, rec709_pure22, rec709_pure24, rec2020, rec2020_pure, rec2020_pure24")
|
||||
xyzPrecision := flag.Int("precision-xyz2rgb", 0, "XYZ -> RGB conversion matrix precision. 0 = maximum")
|
||||
rgbPrecision := flag.Int("precision-rgb2yuv", 0, "RGB -> YUV conversion matrix precision. 0 = maximum")
|
||||
lowres := flag.Uint("lowres", 0, "Feed lowres parameter. Default is full frame")
|
||||
useFloatPipeline := flag.Bool("float", false, "Use float pipeline instead of double, although less precise. Very fast.")
|
||||
useGoPipeline := flag.Bool("use-go-pipeline", false, "Use Go pipeline, although slower. Does not support float mode.")
|
||||
hashOutput := flag.Bool("hash", false, "Hash with SHA256 each output frame for accuracy comparisons")
|
||||
decoderThreads := flag.Uint("decoder-threads", 0, "Threads for JPEG2000 decoding. Defaults to number of logical CPU")
|
||||
pipelineThreads := flag.Uint("pipeline-threads", 0, "Threads for colorspace conversion pipeline. Defaults to number of logical CPU")
|
||||
flag.Parse()
|
||||
|
||||
runtime.KeepAlive(endFrame)
|
||||
|
||||
//C.av_log_set_level(C.AV_LOG_DEBUG)
|
||||
|
||||
numDecoderCpu := int(*decoderThreads)
|
||||
if numDecoderCpu == 0 {
|
||||
numDecoderCpu = runtime.NumCPU()
|
||||
}
|
||||
|
||||
numPipelineCpu := int(*pipelineThreads)
|
||||
if numPipelineCpu == 0 {
|
||||
numPipelineCpu = runtime.NumCPU()
|
||||
}
|
||||
|
||||
useCConverter = !*useGoPipeline
|
||||
useFloat = *useFloatPipeline
|
||||
|
||||
switch strings.ToLower(*colorspaceRelativeSystem) {
|
||||
case "rec709":
|
||||
space = colorspace.SystemRec709
|
||||
rgbGamma = colorspace.GammaRec709
|
||||
case "rec709_pure":
|
||||
space = colorspace.SystemRec709_Pure
|
||||
rgbGamma = colorspace.GammaRec709
|
||||
case "rec709_pure22":
|
||||
space = colorspace.SystemRec709_Pure22
|
||||
rgbGamma = colorspace.Gamma22
|
||||
case "rec709_pure24":
|
||||
space = colorspace.SystemRec709_Pure24
|
||||
rgbGamma = colorspace.Gamma24
|
||||
case "rec2020":
|
||||
space = colorspace.SystemRec2020
|
||||
rgbGamma = colorspace.GammaRec2020
|
||||
case "rec2020_pure":
|
||||
space = colorspace.SystemRec2020_Pure
|
||||
rgbGamma = colorspace.GammaRec2020
|
||||
case "rec2020_pure24":
|
||||
space = colorspace.SystemRec2020_Pure24
|
||||
rgbGamma = colorspace.Gamma24
|
||||
|
||||
default:
|
||||
panic("unsupported colorspace")
|
||||
}
|
||||
|
||||
_, xyz2rgb = space.Chromaticity.ConversionXYZ()
|
||||
_, rgb2yuv = space.YCbCr.ConversionRGB()
|
||||
|
||||
//adjust xyz2rgb with normalization factor from DCI
|
||||
denorm := mat.NewDiagDense(3, []float64{
|
||||
1 / colorspace.DCINormalizationFactor,
|
||||
1 / colorspace.DCINormalizationFactor,
|
||||
1 / colorspace.DCINormalizationFactor,
|
||||
})
|
||||
|
||||
xyz2rgbDenorm = mat.NewDense(3, 3, nil)
|
||||
xyz2rgbDenorm.Mul(denorm, xyz2rgb)
|
||||
|
||||
premult := mat.NewDiagDense(3, []float64{
|
||||
math.MaxUint16,
|
||||
math.MaxUint16,
|
||||
math.MaxUint16,
|
||||
})
|
||||
|
||||
rgb2yuvPremultiplied = mat.NewDense(3, 3, nil)
|
||||
rgb2yuvPremultiplied.Mul(rgb2yuv, premult)
|
||||
|
||||
xyz2rgb = RoundMatToPrecision(xyz2rgb, *xyzPrecision)
|
||||
rgb2yuv = RoundMatToPrecision(rgb2yuv, *rgbPrecision)
|
||||
xyz2rgbDenorm = RoundMatToPrecision(xyz2rgbDenorm, *xyzPrecision)
|
||||
rgb2yuvPremultiplied = RoundMatToPrecision(rgb2yuvPremultiplied, *rgbPrecision)
|
||||
|
||||
_, _ = fmt.Fprintf(os.Stderr, "\nXYZ to RGB matrix:\n%v\n\n", mat.Formatted(xyz2rgb))
|
||||
_, _ = fmt.Fprintf(os.Stderr, "\nXYZ to RGB matrix (denormalized):\n%v\n\n", mat.Formatted(xyz2rgbDenorm))
|
||||
_, _ = fmt.Fprintf(os.Stderr, "\nRGB to YUV matrix:\n%v\n\n", mat.Formatted(rgb2yuv))
|
||||
_, _ = fmt.Fprintf(os.Stderr, "\nRGB to YUV matrix (premultiplied):\n%v\n\n", mat.Formatted(rgb2yuvPremultiplied))
|
||||
|
||||
if useCConverter {
|
||||
_, _ = fmt.Fprintf(os.Stderr, "\nDecoder: CGO %s\n", conv.DecoderInformation())
|
||||
} else {
|
||||
_, _ = fmt.Fprintf(os.Stderr, "\nDecoder: Go Generic scalar pipeline (1d 1f)\n")
|
||||
}
|
||||
|
||||
if useFloat && useCConverter {
|
||||
_, _ = fmt.Fprintf(os.Stderr, "Data type: float32\n\n")
|
||||
} else {
|
||||
_, _ = fmt.Fprintf(os.Stderr, "Data type: float64\n\n")
|
||||
}
|
||||
|
||||
if useCConverter {
|
||||
conv.InitData(xyz2rgbDenorm, rgb2yuvPremultiplied, colorspace.GammaDCIXYZ, rgbGamma)
|
||||
}
|
||||
|
||||
//open and write output file header
|
||||
var output *os.File
|
||||
if *outFile == "-" {
|
||||
output = os.Stdout
|
||||
} else {
|
||||
f, err := os.Create(*outFile)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
output = f
|
||||
}
|
||||
defer output.Close()
|
||||
|
||||
outputFrame := func(number int, y, cb, cr []uint16) error {
|
||||
by := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(y))), len(y)*2)
|
||||
bcb := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(cb))), len(cb)*2)
|
||||
bcr := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(cr))), len(cr)*2)
|
||||
if *hashOutput {
|
||||
hasher := sha256.New()
|
||||
hasher.Write(by)
|
||||
hasher.Write(bcb)
|
||||
hasher.Write(bcr)
|
||||
fmt.Fprintf(os.Stderr, "\r%s\n", hex.EncodeToString(hasher.Sum(nil)))
|
||||
//fmt.Fprintf(os.Stderr, "\rFrame %d: %s\n", number, hex.EncodeToString(hasher.Sum(nil)))
|
||||
}
|
||||
|
||||
_, err := output.WriteString("FRAME\n")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = output.Write(by)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = output.Write(bcb)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = output.Write(bcr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// decode and processing loop
|
||||
var wg sync.WaitGroup
|
||||
|
||||
availableFrames := make(chan *frameJobData, numPipelineCpu*2)
|
||||
inFrameJobs := make(chan *frameJobData, numPipelineCpu)
|
||||
outFrameJobs := make(chan *frameJobData, numPipelineCpu)
|
||||
|
||||
availableDecoders := make(chan struct{}, numDecoderCpu*2)
|
||||
outDecoderJobs := make(chan *decodedFrame, numPipelineCpu)
|
||||
jpegDecoderChannel := make(chan libav.PacketData, numDecoderCpu)
|
||||
|
||||
var expectedFrame = max(0, *startFrame)
|
||||
var expectedFrameDecoder = expectedFrame
|
||||
var processedFrames atomic.Uint64
|
||||
|
||||
var firstFrame = expectedFrame
|
||||
var firstFrameTime time.Time
|
||||
|
||||
var wg2 sync.WaitGroup
|
||||
for i := 0; i < numPipelineCpu; i++ {
|
||||
wg2.Add(1)
|
||||
go func() {
|
||||
defer wg2.Done()
|
||||
for job := range inFrameJobs {
|
||||
job.Process()
|
||||
processedFrames.Add(1)
|
||||
outFrameJobs <- job
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
wg2.Wait()
|
||||
close(outFrameJobs)
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
outputs := make([]*frameJobData, 0)
|
||||
for out := range outFrameJobs {
|
||||
outputs = append(outputs, out)
|
||||
slices.SortFunc(outputs, func(a, b *frameJobData) int {
|
||||
return a.frame - b.frame
|
||||
})
|
||||
|
||||
for len(outputs) > 0 {
|
||||
f := outputs[0]
|
||||
|
||||
if f.frame != int(expectedFrame) {
|
||||
break
|
||||
}
|
||||
|
||||
//output frame to file
|
||||
err := outputFrame(f.frame, f.y, f.cb, f.cr)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
outputs = slices.Delete(outputs, 0, 1)
|
||||
|
||||
expectedFrame++
|
||||
|
||||
availableFrames <- f
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
decoder, err := libopenjp2.NewJpeg2000Decoder(libopenjp2.QualityLayersAll, *lowres)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
var streamFramerateNum, streamFramerateDen, streamSarNum, streamSarDen int
|
||||
|
||||
var onceInit sync.Once
|
||||
var wgDecoder sync.WaitGroup
|
||||
for i := 0; i < numDecoderCpu*2; i++ {
|
||||
wg.Add(1)
|
||||
wgDecoder.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer wgDecoder.Add(1)
|
||||
|
||||
for p := range jpegDecoderChannel {
|
||||
|
||||
frame, err := decoder.DecodeFrame(p.Data)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
onceInit.Do(func() {
|
||||
_, err := output.WriteString(fmt.Sprintf("YUV4MPEG2 W%d H%d F%d:%d I%s A%d:%d%s%s\n",
|
||||
frame.Width,
|
||||
frame.Height,
|
||||
streamFramerateNum,
|
||||
streamFramerateDen,
|
||||
"p",
|
||||
streamSarNum,
|
||||
streamSarDen,
|
||||
" C444p16 XYSCSS=444P16",
|
||||
" XCOLORRANGE=FULL",
|
||||
))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
yuvLineSize := frame.Width
|
||||
yuvFrameSize := frame.Height * yuvLineSize
|
||||
|
||||
for i := 0; i < numPipelineCpu*2; i++ {
|
||||
availableFrames <- &frameJobData{
|
||||
wg: &wg,
|
||||
frame: 0,
|
||||
width: frame.Width,
|
||||
height: frame.Height,
|
||||
in: nil,
|
||||
//add extra capacity for OOB writes in ASM code
|
||||
y: make([]uint16, yuvFrameSize, yuvFrameSize+64),
|
||||
cb: make([]uint16, yuvFrameSize, yuvFrameSize+64),
|
||||
cr: make([]uint16, yuvFrameSize, yuvFrameSize+64),
|
||||
}
|
||||
}
|
||||
firstFrameTime = time.Now().UTC()
|
||||
})
|
||||
|
||||
outDecoderJobs <- &decodedFrame{
|
||||
Number: p.Number,
|
||||
//add extra capacity for OOB reads in ASM code
|
||||
Frame: ToPacked(frame.X, frame.Y, frame.Z, 256),
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
defer close(inFrameJobs)
|
||||
|
||||
outputs := make([]*decodedFrame, 0)
|
||||
for out := range outDecoderJobs {
|
||||
outputs = append(outputs, out)
|
||||
slices.SortFunc(outputs, func(a, b *decodedFrame) int {
|
||||
return a.Number - b.Number
|
||||
})
|
||||
|
||||
for len(outputs) > 0 {
|
||||
frame := outputs[0]
|
||||
|
||||
if frame.Number != int(expectedFrameDecoder) {
|
||||
break
|
||||
}
|
||||
|
||||
f := <-availableFrames
|
||||
f.frame = frame.Number
|
||||
//f.inLineSize = linesize
|
||||
|
||||
f.in = frame.Frame
|
||||
wg.Add(1)
|
||||
inFrameJobs <- f
|
||||
|
||||
outputs = slices.Delete(outputs, 0, 1)
|
||||
|
||||
expectedFrameDecoder++
|
||||
|
||||
availableDecoders <- struct{}{}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for range time.Tick(time.Second) {
|
||||
frame := int(processedFrames.Load())
|
||||
runningTime := time.Now().UTC().Sub(firstFrameTime)
|
||||
fps := float64(frame-int(firstFrame)+1) / runningTime.Seconds()
|
||||
_, _ = fmt.Fprintf(os.Stderr, "\rFrames %d %.02f fps %s ", frame, fps, runningTime.Truncate(time.Second))
|
||||
}
|
||||
}()
|
||||
|
||||
err = libav.OpenXYZ12(*inFile, func(framerateNum, framerateDen, sarNum, sarDen, width, height int) error {
|
||||
streamFramerateNum = framerateNum
|
||||
streamFramerateDen = framerateDen
|
||||
streamSarNum = sarNum
|
||||
streamSarDen = sarDen
|
||||
|
||||
for i := 0; i < numDecoderCpu*2; i++ {
|
||||
availableDecoders <- struct{}{}
|
||||
}
|
||||
|
||||
return nil
|
||||
}, func(p libav.PacketData) error {
|
||||
if uint64(p.Number) < *startFrame {
|
||||
firstFrameTime = time.Now().UTC()
|
||||
return nil
|
||||
}
|
||||
|
||||
if uint64(p.Number) >= *endFrame {
|
||||
return io.EOF
|
||||
}
|
||||
|
||||
<-availableDecoders
|
||||
|
||||
jpegDecoderChannel <- p
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
close(jpegDecoderChannel)
|
||||
wgDecoder.Wait()
|
||||
close(outDecoderJobs)
|
||||
|
||||
wg.Wait()
|
||||
|
||||
print("\n\n")
|
||||
|
||||
runningTime := time.Now().UTC().Sub(firstFrameTime)
|
||||
fps := float64(int(processedFrames.Load())-int(firstFrame)+1) / runningTime.Seconds()
|
||||
_, _ = fmt.Fprintf(os.Stderr, "\nTotal %d frames, %.02f fps, took %s \n", processedFrames.Load(), fps, runningTime.Truncate(time.Millisecond))
|
||||
|
||||
}
|
Loading…
Reference in a new issue