Implement LPC Synthesis
Some data now makes it all the way through the decoder. LTP synthesis still needs to be implemented.
This commit is contained in:
parent
f20a9eedf1
commit
b2df0354df
16
decoder.go
16
decoder.go
|
@ -17,9 +17,9 @@ func NewDecoder() *Decoder {
|
|||
}
|
||||
|
||||
// Decode decodes the Opus bitstream into PCM
|
||||
func (d *Decoder) Decode(in []byte) (bandwidth Bandwidth, isStereo bool, frames [][]byte, err error) {
|
||||
func (d *Decoder) Decode(in []byte, out []float64) (bandwidth Bandwidth, isStereo bool, err error) {
|
||||
if len(in) < 1 {
|
||||
return 0, false, nil, errTooShortForTableOfContentsHeader
|
||||
return 0, false, errTooShortForTableOfContentsHeader
|
||||
}
|
||||
|
||||
tocHeader := tableOfContentsHeader(in[0])
|
||||
|
@ -30,21 +30,19 @@ func (d *Decoder) Decode(in []byte) (bandwidth Bandwidth, isStereo bool, frames
|
|||
case frameCodeOneFrame:
|
||||
encodedFrames = append(encodedFrames, in[1:])
|
||||
default:
|
||||
return 0, false, nil, fmt.Errorf("%w: %d", errUnsupportedFrameCode, tocHeader.frameCode())
|
||||
return 0, false, fmt.Errorf("%w: %d", errUnsupportedFrameCode, tocHeader.frameCode())
|
||||
}
|
||||
|
||||
if cfg.mode() != configurationModeSilkOnly {
|
||||
return 0, false, nil, fmt.Errorf("%w: %d", errUnsupportedConfigurationMode, cfg.mode())
|
||||
return 0, false, fmt.Errorf("%w: %d", errUnsupportedConfigurationMode, cfg.mode())
|
||||
}
|
||||
|
||||
for _, encodedFrame := range encodedFrames {
|
||||
decoded, err := d.silkDecoder.Decode(encodedFrame, []byte{}, tocHeader.isStereo(), cfg.frameDuration().nanoseconds(), silk.Bandwidth(cfg.bandwidth()))
|
||||
err := d.silkDecoder.Decode(encodedFrame, out, tocHeader.isStereo(), cfg.frameDuration().nanoseconds(), silk.Bandwidth(cfg.bandwidth()))
|
||||
if err != nil {
|
||||
return 0, false, nil, err
|
||||
return 0, false, err
|
||||
}
|
||||
|
||||
frames = append(frames, decoded)
|
||||
}
|
||||
|
||||
return cfg.bandwidth(), tocHeader.isStereo(), frames, nil
|
||||
return cfg.bandwidth(), tocHeader.isStereo(), nil
|
||||
}
|
||||
|
|
|
@ -29,6 +29,7 @@ func main() {
|
|||
panic(err)
|
||||
}
|
||||
|
||||
out := make([]float64, 1500)
|
||||
for {
|
||||
pageData, _, err := ogg.ParseNextPage()
|
||||
if errors.Is(err, io.EOF) {
|
||||
|
@ -41,11 +42,11 @@ func main() {
|
|||
panic(err)
|
||||
}
|
||||
|
||||
bandwidth, isStereo, frames, err := decoder.Decode(pageData)
|
||||
bandwidth, isStereo, err := decoder.Decode(pageData, out)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
fmt.Printf("bandwidth(%s) isStereo(%t) framesCount(%d)\n", bandwidth.String(), isStereo, len(frames))
|
||||
fmt.Printf("bandwidth(%s) isStereo(%t) framesCount(%d)\n", bandwidth.String(), isStereo, len(out))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ type Decoder struct {
|
|||
// TODO, should have dedicated frame state
|
||||
logGain uint32
|
||||
subframeState [4]struct {
|
||||
gain float64
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -95,13 +94,15 @@ func (d *Decoder) determineFrameType(voiceActivityDetected bool) (signalType fra
|
|||
// A separate quantization gain is coded for each 5 ms subframe
|
||||
//
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.4
|
||||
func (d *Decoder) decodeSubframeQuantizations(signalType frameSignalType) {
|
||||
func (d *Decoder) decodeSubframeQuantizations(signalType frameSignalType) (gainQ16 []float64) {
|
||||
var (
|
||||
logGain uint32
|
||||
deltaGainIndex uint32
|
||||
gainIndex uint32
|
||||
)
|
||||
|
||||
gainQ16 = make([]float64, 4)
|
||||
|
||||
for subframeIndex := 0; subframeIndex < 4; subframeIndex++ {
|
||||
|
||||
//The subframe gains are either coded independently, or relative to the
|
||||
|
@ -168,9 +169,10 @@ func (d *Decoder) decodeSubframeQuantizations(signalType frameSignalType) {
|
|||
// between 81920 and 1686110208, inclusive (representing scale factors
|
||||
// of 1.25 to 25728, respectively).
|
||||
|
||||
gainQ16 := (1 << i) + ((-174*f*(128-f)>>16)+f)*((1<<i)>>7)
|
||||
d.subframeState[subframeIndex].gain = float64(gainQ16) / 65536
|
||||
gainQ16[subframeIndex] = float64((1 << i) + ((-174*f*(128-f)>>16)+f)*((1<<i)>>7))
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// A set of normalized Line Spectral Frequency (LSF) coefficients follow
|
||||
|
@ -209,7 +211,7 @@ func (d *Decoder) normalizeLineSpectralFrequencyStageOne(voiceActivityDetected b
|
|||
// Predictive Coding (LPC) coefficients for the current SILK frame.
|
||||
//
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
|
||||
func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1 uint32) (resQ10 []int16) {
|
||||
func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1 uint32) (dLPC int, resQ10 []int16) {
|
||||
// Decoding the second stage residual proceeds as follows. For each
|
||||
// coefficient, the decoder reads a symbol using the PDF corresponding
|
||||
// to I1 from either Table 17 or Table 18,
|
||||
|
@ -265,7 +267,7 @@ func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1
|
|||
resQ10 = make([]int16, len(I2))
|
||||
|
||||
// Let d_LPC be the order of the codebook, i.e., 10 for NB and MB, and 16 for WB
|
||||
dLPC := len(I2)
|
||||
dLPC = len(I2)
|
||||
|
||||
// for 0 <= k < d_LPC-1
|
||||
for k := dLPC - 2; k >= 0; k-- {
|
||||
|
@ -309,10 +311,7 @@ func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1
|
|||
// reconstructed.
|
||||
//
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
|
||||
func (d *Decoder) normalizeLineSpectralFrequencyCoefficients(bandwidth Bandwidth, resQ10 []int16, I1 uint32) (nlsfQ15 []int16) {
|
||||
// Let d_LPC be the order of the codebook, i.e., 10 for NB and MB, and 16 for WB
|
||||
dLPC := len(resQ10)
|
||||
|
||||
func (d *Decoder) normalizeLineSpectralFrequencyCoefficients(dLPC int, bandwidth Bandwidth, resQ10 []int16, I1 uint32) (nlsfQ15 []int16) {
|
||||
nlsfQ15 = make([]int16, dLPC)
|
||||
w2Q18 := make([]uint, dLPC)
|
||||
wQ9 := make([]int16, dLPC)
|
||||
|
@ -1116,12 +1115,97 @@ func (d *Decoder) decodeLTPFilterCoefficients(signalType frameSignalType) error
|
|||
return nil
|
||||
}
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1
|
||||
func (d *Decoder) ltpSynthesis() {
|
||||
// generateResValue is an unimplemented placeholder: it ignores eQ23 and
// always returns nil.
// NOTE(review): presumably intended to turn the Q23 excitation into the
// residual signal - confirm against the callers before relying on it.
func (d *Decoder) generateResValue(eQ23 []int32) []float64 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// let n be the number of samples in a subframe (40 for NB, 60 for
|
||||
// MB, and 80 for WB)
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9
|
||||
func (d *Decoder) samplesInSubframe(bandwidth Bandwidth) int {
|
||||
switch bandwidth {
|
||||
case BandwidthNarrowband:
|
||||
return 40
|
||||
case BandwidthMediumband:
|
||||
return 60
|
||||
case BandwidthWideband:
|
||||
return 80
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1
|
||||
func (d *Decoder) ltpSynthesis(signalType frameSignalType, eQ23 []int32) (res []float64) {
|
||||
// For unvoiced frames (see Section 4.2.7.3), the LPC residual for i
|
||||
// such that j <= i < (j + n) is simply a normalized copy of the
|
||||
// excitation signal, i.e.,
|
||||
//
|
||||
// e_Q23[i]
|
||||
// res[i] = ---------
|
||||
// 2.0**23
|
||||
|
||||
res = make([]float64, len(eQ23))
|
||||
if signalType != frameSignalTypeVoiced {
|
||||
for i := range eQ23 {
|
||||
res[i] = float64(eQ23[i]) / 8388608
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// LPC synthesis uses the short-term LPC filter to predict the next
|
||||
// output coefficient. For i such that (j - d_LPC) <= i < j, let lpc[i]
|
||||
// be the result of LPC synthesis from the last d_LPC samples of the
|
||||
// previous subframe or zeros in the first subframe for this channel
|
||||
// after either an uncoded regular SILK frame (if this is a side channel) or a decoder reset.
|
||||
//
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.2
|
||||
func (d *Decoder) lpcSynthesis() {
|
||||
func (d *Decoder) lpcSynthesis(out []float64, bandwidth Bandwidth, dLPC int, aQ12, res, gainQ16 []float64) {
|
||||
// let n be the number of samples in a subframe
|
||||
n := d.samplesInSubframe(bandwidth)
|
||||
|
||||
// j be the index of the first sample in the residual corresponding to
|
||||
// the current subframe.
|
||||
j := 0
|
||||
|
||||
// let lpc[i] be the result of LPC synthesis from the last d_LPC samples of the
|
||||
// previous subframe or zeros in the first subframe for this channel
|
||||
lpc := make([]float64, n)
|
||||
|
||||
//Then, for i such that j <= i < (j + n), the result of LPC synthesis
|
||||
//for the current subframe is
|
||||
//
|
||||
// d_LPC-1
|
||||
// gain_Q16[i] __ a_Q12[k]
|
||||
// lpc[i] = ----------- * res[i] + \ lpc[i-k-1] * --------
|
||||
// 65536.0 /_ 4096.0
|
||||
// k=0
|
||||
//
|
||||
for i := j; i < (j + n); i++ {
|
||||
lpcVal := gainQ16[0] / 65536.0
|
||||
lpcVal *= res[i]
|
||||
for k := 0; k < dLPC; k++ {
|
||||
if i-k > 0 {
|
||||
lpcVal += lpc[i-k-1] * (aQ12[k] / 4096.0)
|
||||
}
|
||||
}
|
||||
|
||||
lpc[i] = lpcVal
|
||||
|
||||
// The decoder saves the final d_LPC values, i.e., lpc[i] such that
|
||||
// (j + n - d_LPC) <= i < (j + n), to feed into the LPC synthesis of the
|
||||
// next subframe. This requires storage for up to 16 values of lpc[i]
|
||||
// (for WB frames).
|
||||
|
||||
// Then, the signal is clamped into the final nominal range:
|
||||
//
|
||||
// out[i] = clamp(-1.0, lpc[i], 1.0)
|
||||
//
|
||||
out[i] = clampFloat(-1.0, lpc[i], 1.0)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Decode decodes many SILK subframes
|
||||
|
@ -1156,33 +1240,33 @@ func (d *Decoder) lpcSynthesis() {
|
|||
// 8: Resampled signal
|
||||
//
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.1
|
||||
func (d *Decoder) Decode(in, out []byte, isStereo bool, nanoseconds int, bandwidth Bandwidth) ([]byte, error) {
|
||||
func (d *Decoder) Decode(in []byte, out []float64, isStereo bool, nanoseconds int, bandwidth Bandwidth) error {
|
||||
if nanoseconds != nanoseconds20Ms {
|
||||
return nil, errUnsupportedSilkFrameDuration
|
||||
return errUnsupportedSilkFrameDuration
|
||||
} else if isStereo {
|
||||
return nil, errUnsupportedSilkStereo
|
||||
return errUnsupportedSilkStereo
|
||||
}
|
||||
|
||||
d.rangeDecoder.Init(in)
|
||||
|
||||
voiceActivityDetected, lowBitRateRedundancy := d.decodeHeaderBits()
|
||||
if lowBitRateRedundancy {
|
||||
return nil, errUnsupportedSilkLowBitrateRedundancy
|
||||
return errUnsupportedSilkLowBitrateRedundancy
|
||||
}
|
||||
|
||||
signalType, quantizationOffsetType := d.determineFrameType(voiceActivityDetected)
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.4
|
||||
d.decodeSubframeQuantizations(signalType)
|
||||
gainQ16 := d.decodeSubframeQuantizations(signalType)
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.1
|
||||
I1 := d.normalizeLineSpectralFrequencyStageOne(voiceActivityDetected, bandwidth)
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
|
||||
resQ10 := d.normalizeLineSpectralFrequencyStageTwo(bandwidth, I1)
|
||||
dLPC, resQ10 := d.normalizeLineSpectralFrequencyStageTwo(bandwidth, I1)
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
|
||||
nlsfQ15 := d.normalizeLineSpectralFrequencyCoefficients(bandwidth, resQ10, I1)
|
||||
nlsfQ15 := d.normalizeLineSpectralFrequencyCoefficients(dLPC, bandwidth, resQ10, I1)
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.4
|
||||
d.normalizeLSFStabilization(nlsfQ15)
|
||||
|
@ -1190,7 +1274,7 @@ func (d *Decoder) Decode(in, out []byte, isStereo bool, nanoseconds int, bandwid
|
|||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.5
|
||||
n1Q15, err := d.normalizeLSFInterpolation(nlsfQ15)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.6
|
||||
|
@ -1200,22 +1284,22 @@ func (d *Decoder) Decode(in, out []byte, isStereo bool, nanoseconds int, bandwid
|
|||
d.limitLPCCoefficientsRange(a32Q17)
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.5.8
|
||||
d.limitLPCFilterPredictionGain(a32Q17)
|
||||
aQ12 := d.limitLPCFilterPredictionGain(a32Q17)
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.1
|
||||
if err := d.decodePitchLags(signalType); err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.2
|
||||
if err := d.decodeLTPFilterCoefficients(signalType); err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.3
|
||||
_, err = d.decodeLTPScalingParamater(signalType)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.7
|
||||
|
@ -1231,14 +1315,14 @@ func (d *Decoder) Decode(in, out []byte, isStereo bool, nanoseconds int, bandwid
|
|||
pulsecounts, lsbcounts := d.decodePulseAndLSBCounts(shellblocks, rateLevel)
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.8.6
|
||||
d.decodeExcitation(signalType, quantizationOffsetType, lcgSeed, pulsecounts, lsbcounts)
|
||||
eQ23 := d.decodeExcitation(signalType, quantizationOffsetType, lcgSeed, pulsecounts, lsbcounts)
|
||||
|
||||
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1
|
||||
d.ltpSynthesis()
|
||||
res := d.ltpSynthesis(signalType, eQ23)
|
||||
|
||||
//https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.2
|
||||
d.lpcSynthesis()
|
||||
d.lpcSynthesis(out, bandwidth, dLPC, aQ12, res, gainQ16)
|
||||
|
||||
d.isPreviousFrameVoiced = signalType == frameSignalTypeVoiced
|
||||
return out, nil
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ func createRangeDecoder(data []byte, bitsRead uint, rangeSize uint32, highAndCod
|
|||
|
||||
func TestDecode20MsOnly(t *testing.T) {
|
||||
d := &Decoder{}
|
||||
_, err := d.Decode(testSilkFrame(), []byte{}, false, 1, BandwidthWideband)
|
||||
err := d.Decode(testSilkFrame(), []float64{}, false, 1, BandwidthWideband)
|
||||
if !errors.Is(err, errUnsupportedSilkFrameDuration) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ func TestDecode20MsOnly(t *testing.T) {
|
|||
|
||||
func TestDecodeStereoTODO(t *testing.T) {
|
||||
d := &Decoder{}
|
||||
_, err := d.Decode(testSilkFrame(), []byte{}, true, nanoseconds20Ms, BandwidthWideband)
|
||||
err := d.Decode(testSilkFrame(), []float64{}, true, nanoseconds20Ms, BandwidthWideband)
|
||||
if !errors.Is(err, errUnsupportedSilkStereo) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -57,16 +57,15 @@ func TestDecodeFrameType(t *testing.T) {
|
|||
func TestDecodeSubframeQuantizations(t *testing.T) {
|
||||
d := &Decoder{rangeDecoder: createRangeDecoder(testSilkFrame(), 31, 482344960, 437100388)}
|
||||
|
||||
d.decodeSubframeQuantizations(frameSignalTypeInactive)
|
||||
|
||||
gainQ16 := d.decodeSubframeQuantizations(frameSignalTypeInactive)
|
||||
switch {
|
||||
case d.subframeState[0].gain != 3.21875:
|
||||
case gainQ16[0] != 210944:
|
||||
t.Fatal()
|
||||
case d.subframeState[1].gain != 1.71875:
|
||||
case gainQ16[1] != 112640:
|
||||
t.Fatal()
|
||||
case d.subframeState[2].gain != 1.46875:
|
||||
case gainQ16[2] != 96256:
|
||||
t.Fatal()
|
||||
case d.subframeState[3].gain != 1.46875:
|
||||
case gainQ16[3] != 96256:
|
||||
t.Fatal()
|
||||
}
|
||||
}
|
||||
|
@ -83,16 +82,18 @@ func TestNormalizeLineSpectralFrequencyStageOne(t *testing.T) {
|
|||
func TestNormalizeLineSpectralFrequencyStageTwo(t *testing.T) {
|
||||
d := &Decoder{rangeDecoder: createRangeDecoder(testSilkFrame(), 47, 50822640, 5895957)}
|
||||
|
||||
resQ10 := d.normalizeLineSpectralFrequencyStageTwo(BandwidthWideband, 9)
|
||||
dLPC, resQ10 := d.normalizeLineSpectralFrequencyStageTwo(BandwidthWideband, 9)
|
||||
if !reflect.DeepEqual(resQ10, testResQ10()) {
|
||||
t.Fatal()
|
||||
} else if dLPC != 16 {
|
||||
t.Fatal()
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeLineSpectralFrequencyCoefficients(t *testing.T) {
|
||||
d := &Decoder{rangeDecoder: createRangeDecoder(testSilkFrame(), 55, 493249168, 174371199)}
|
||||
|
||||
nlsfQ1 := d.normalizeLineSpectralFrequencyCoefficients(BandwidthWideband, testResQ10(), 9)
|
||||
nlsfQ1 := d.normalizeLineSpectralFrequencyCoefficients(16, BandwidthWideband, testResQ10(), 9)
|
||||
if !reflect.DeepEqual(nlsfQ1, testNlsfQ1()) {
|
||||
t.Fatal()
|
||||
}
|
||||
|
|
|
@ -63,6 +63,16 @@ func clamp(low, in, high int32) int32 {
|
|||
return in
|
||||
}
|
||||
|
||||
// clampFloat limits in to the closed range [low, high].
//
// Note the argument order: the value being clamped is the middle argument,
// mirroring the clamp(lo, x, hi) notation used by the integer clamp helper.
// The upper bound is checked first, so if low > high any value above high
// yields high.
func clampFloat(low, in, high float64) float64 {
	switch {
	case in > high:
		return high
	case in < low:
		return low
	default:
		return in
	}
}
|
||||
|
||||
// The sign of x, i.e.,
|
||||
// ( -1, x < 0
|
||||
// sign(x) = < 0, x == 0
|
||||
|
|
Loading…
Reference in a new issue