Implement LPC Synthesis

Some data now makes it all the way through the decoder. LTP Synthesis
needs to be implemented next.
This commit is contained in:
Sean DuBois 2022-09-06 23:49:51 -04:00
parent f20a9eedf1
commit b2df0354df
5 changed files with 144 additions and 50 deletions

View file

@ -17,9 +17,9 @@ func NewDecoder() *Decoder {
}
// Decode decodes the Opus bitstream into PCM
func (d *Decoder) Decode(in []byte) (bandwidth Bandwidth, isStereo bool, frames [][]byte, err error) {
func (d *Decoder) Decode(in []byte, out []float64) (bandwidth Bandwidth, isStereo bool, err error) {
if len(in) < 1 {
return 0, false, nil, errTooShortForTableOfContentsHeader
return 0, false, errTooShortForTableOfContentsHeader
}
tocHeader := tableOfContentsHeader(in[0])
@ -30,21 +30,19 @@ func (d *Decoder) Decode(in []byte) (bandwidth Bandwidth, isStereo bool, frames
case frameCodeOneFrame:
encodedFrames = append(encodedFrames, in[1:])
default:
return 0, false, nil, fmt.Errorf("%w: %d", errUnsupportedFrameCode, tocHeader.frameCode())
return 0, false, fmt.Errorf("%w: %d", errUnsupportedFrameCode, tocHeader.frameCode())
}
if cfg.mode() != configurationModeSilkOnly {
return 0, false, nil, fmt.Errorf("%w: %d", errUnsupportedConfigurationMode, cfg.mode())
return 0, false, fmt.Errorf("%w: %d", errUnsupportedConfigurationMode, cfg.mode())
}
for _, encodedFrame := range encodedFrames {
decoded, err := d.silkDecoder.Decode(encodedFrame, []byte{}, tocHeader.isStereo(), cfg.frameDuration().nanoseconds(), silk.Bandwidth(cfg.bandwidth()))
err := d.silkDecoder.Decode(encodedFrame, out, tocHeader.isStereo(), cfg.frameDuration().nanoseconds(), silk.Bandwidth(cfg.bandwidth()))
if err != nil {
return 0, false, nil, err
return 0, false, err
}
frames = append(frames, decoded)
}
return cfg.bandwidth(), tocHeader.isStereo(), frames, nil
return cfg.bandwidth(), tocHeader.isStereo(), nil
}

View file

@ -29,6 +29,7 @@ func main() {
panic(err)
}
out := make([]float64, 1500)
for {
pageData, _, err := ogg.ParseNextPage()
if errors.Is(err, io.EOF) {
@ -41,11 +42,11 @@ func main() {
panic(err)
}
bandwidth, isStereo, frames, err := decoder.Decode(pageData)
bandwidth, isStereo, err := decoder.Decode(pageData, out)
if err != nil {
panic(err)
}
fmt.Printf("bandwidth(%s) isStereo(%t) framesCount(%d)\n", bandwidth.String(), isStereo, len(frames))
fmt.Printf("bandwidth(%s) isStereo(%t) framesCount(%d)\n", bandwidth.String(), isStereo, len(out))
}
}

View file

@ -18,7 +18,6 @@ type Decoder struct {
// TODO, should have dedicated frame state
logGain uint32
subframeState [4]struct {
gain float64
}
}
@ -95,13 +94,15 @@ func (d *Decoder) determineFrameType(voiceActivityDetected bool) (signalType fra
// A separate quantization gain is coded for each 5 ms subframe
//
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.4
func (d *Decoder) decodeSubframeQuantizations(signalType frameSignalType) {
func (d *Decoder) decodeSubframeQuantizations(signalType frameSignalType) (gainQ16 []float64) {
var (
logGain uint32
deltaGainIndex uint32
gainIndex uint32
)
gainQ16 = make([]float64, 4)
for subframeIndex := 0; subframeIndex < 4; subframeIndex++ {
//The subframe gains are either coded independently, or relative to the
@ -168,9 +169,10 @@ func (d *Decoder) decodeSubframeQuantizations(signalType frameSignalType) {
// between 81920 and 1686110208, inclusive (representing scale factors
// of 1.25 to 25728, respectively).
gainQ16 := (1 << i) + ((-174*f*(128-f)>>16)+f)*((1<<i)>>7)
d.subframeState[subframeIndex].gain = float64(gainQ16) / 65536
gainQ16[subframeIndex] = float64((1 << i) + ((-174*f*(128-f)>>16)+f)*((1<<i)>>7))
}
return
}
// A set of normalized Line Spectral Frequency (LSF) coefficients follow
@ -209,7 +211,7 @@ func (d *Decoder) normalizeLineSpectralFrequencyStageOne(voiceActivityDetected b
// Predictive Coding (LPC) coefficients for the current SILK frame.
//
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1 uint32) (resQ10 []int16) {
func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1 uint32) (dLPC int, resQ10 []int16) {
// Decoding the second stage residual proceeds as follows. For each
// coefficient, the decoder reads a symbol using the PDF corresponding
// to I1 from either Table 17 or Table 18,
@ -265,7 +267,7 @@ func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1
resQ10 = make([]int16, len(I2))
// Let d_LPC be the order of the codebook, i.e., 10 for NB and MB, and 16 for WB
dLPC := len(I2)
dLPC = len(I2)
// for 0 <= k < d_LPC-1
for k := dLPC - 2; k >= 0; k-- {
@ -309,10 +311,7 @@ func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1
// reconstructed.
//
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
func (d *Decoder) normalizeLineSpectralFrequencyCoefficients(bandwidth Bandwidth, resQ10 []int16, I1 uint32) (nlsfQ15 []int16) {
// Let d_LPC be the order of the codebook, i.e., 10 for NB and MB, and 16 for WB
dLPC := len(resQ10)
func (d *Decoder) normalizeLineSpectralFrequencyCoefficients(dLPC int, bandwidth Bandwidth, resQ10 []int16, I1 uint32) (nlsfQ15 []int16) {
nlsfQ15 = make([]int16, dLPC)
w2Q18 := make([]uint, dLPC)
wQ9 := make([]int16, dLPC)
@ -1116,12 +1115,97 @@ func (d *Decoder) decodeLTPFilterCoefficients(signalType frameSignalType) error
return nil
}
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1
func (d *Decoder) ltpSynthesis() {
// generateResValue is currently a stub: it ignores eQ23 and always
// returns nil.
func (d *Decoder) generateResValue(eQ23 []int32) []float64 {
return nil
}
// samplesInSubframe returns n, the number of samples in a subframe for the
// given bandwidth: 40 for NB, 60 for MB, and 80 for WB. Any other bandwidth
// yields 0.
//
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9
func (d *Decoder) samplesInSubframe(bandwidth Bandwidth) int {
	// Bandwidths not listed below fall through with the zero count.
	samples := 0

	switch bandwidth {
	case BandwidthNarrowband:
		samples = 40
	case BandwidthMediumband:
		samples = 60
	case BandwidthWideband:
		samples = 80
	}

	return samples
}
// ltpSynthesis turns the Q23 excitation signal into the LPC residual.
//
// For unvoiced frames (see Section 4.2.7.3), the LPC residual is simply a
// normalized copy of the excitation signal, i.e.,
//
//	          e_Q23[i]
//	res[i] = ---------
//	          2.0**23
//
// For voiced frames no synthesis is performed here: the returned slice has
// len(eQ23) entries, all left at zero.
//
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1
func (d *Decoder) ltpSynthesis(signalType frameSignalType, eQ23 []int32) (res []float64) {
	res = make([]float64, len(eQ23))
	if signalType == frameSignalTypeVoiced {
		// Nothing is written for voiced frames; res stays zero-filled.
		return res
	}

	// 2.0**23 == 8388608, the Q23 normalization factor.
	const q23Scale = 1 << 23
	for i, excitation := range eQ23 {
		res[i] = float64(excitation) / q23Scale
	}

	return res
}
// LPC synthesis uses the short-term LPC filter to predict the next
// output coefficient. For i such that (j - d_LPC) <= i < j, let lpc[i]
// be the result of LPC synthesis from the last d_LPC samples of the
// previous subframe or zeros in the first subframe for this channel
// after either an uncoded regular SILK frame (side channel only) or a
// decoder reset.
//
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.2
func (d *Decoder) lpcSynthesis() {
func (d *Decoder) lpcSynthesis(out []float64, bandwidth Bandwidth, dLPC int, aQ12, res, gainQ16 []float64) {
	// Writes the clamped LPC synthesis of one subframe into out. Only the
	// subframe starting at sample 0 is processed, and gainQ16[0] is the only
	// gain that is read.
	//
	// j is the index of the first sample in the residual corresponding to
	// the current subframe; n is the number of samples in a subframe.
	const j = 0
	n := d.samplesInSubframe(bandwidth)

	// lpc[i] holds the unclamped synthesis result for this subframe. There
	// is no stored history from a previous subframe: taps that would reach
	// before the start of the buffer are skipped, which is equivalent to
	// treating them as zeros.
	lpc := make([]float64, n)

	// For i such that j <= i < (j + n), the result of LPC synthesis for the
	// current subframe is
	//
	//                                     d_LPC-1
	//               gain_Q16[i]            __              a_Q12[k]
	//      lpc[i] = ----------- * res[i] + \  lpc[i-k-1] * --------
	//                 65536.0              /_               4096.0
	//                                      k=0
	//
	for i := j; i < j+n; i++ {
		acc := gainQ16[0] / 65536.0
		acc *= res[i]

		// Only taps whose history sample lpc[i-k-1] exists contribute,
		// i.e. those with k < i.
		for k := 0; k < dLPC && k < i; k++ {
			acc += lpc[i-k-1] * (aQ12[k] / 4096.0)
		}
		lpc[i] = acc

		// The RFC has the decoder save the final d_LPC values, i.e., lpc[i]
		// such that (j + n - d_LPC) <= i < (j + n), to feed into the LPC
		// synthesis of the next subframe (up to 16 values for WB frames).
		// That is not done here: lpc is local to this call.

		// Then, the signal is clamped into the final nominal range:
		//
		//     out[i] = clamp(-1.0, lpc[i], 1.0)
		//
		out[i] = clampFloat(-1.0, acc, 1.0)
	}
}
// Decode decodes many SILK subframes
@ -1156,33 +1240,33 @@ func (d *Decoder) lpcSynthesis() {
// 8: Resampled signal
//
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.1
func (d *Decoder) Decode(in, out []byte, isStereo bool, nanoseconds int, bandwidth Bandwidth) ([]byte, error) {
func (d *Decoder) Decode(in []byte, out []float64, isStereo bool, nanoseconds int, bandwidth Bandwidth) error {
if nanoseconds != nanoseconds20Ms {
return nil, errUnsupportedSilkFrameDuration
return errUnsupportedSilkFrameDuration
} else if isStereo {
return nil, errUnsupportedSilkStereo
return errUnsupportedSilkStereo
}
d.rangeDecoder.Init(in)
voiceActivityDetected, lowBitRateRedundancy := d.decodeHeaderBits()
if lowBitRateRedundancy {
return nil, errUnsupportedSilkLowBitrateRedundancy
return errUnsupportedSilkLowBitrateRedundancy
}
signalType, quantizationOffsetType := d.determineFrameType(voiceActivityDetected)
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.4
d.decodeSubframeQuantizations(signalType)
gainQ16 := d.decodeSubframeQuantizations(signalType)
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.1
I1 := d.normalizeLineSpectralFrequencyStageOne(voiceActivityDetected, bandwidth)
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
resQ10 := d.normalizeLineSpectralFrequencyStageTwo(bandwidth, I1)
dLPC, resQ10 := d.normalizeLineSpectralFrequencyStageTwo(bandwidth, I1)
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
nlsfQ15 := d.normalizeLineSpectralFrequencyCoefficients(bandwidth, resQ10, I1)
nlsfQ15 := d.normalizeLineSpectralFrequencyCoefficients(dLPC, bandwidth, resQ10, I1)
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.4
d.normalizeLSFStabilization(nlsfQ15)
@ -1190,7 +1274,7 @@ func (d *Decoder) Decode(in, out []byte, isStereo bool, nanoseconds int, bandwid
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.5
n1Q15, err := d.normalizeLSFInterpolation(nlsfQ15)
if err != nil {
return nil, err
return err
}
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.6
@ -1200,22 +1284,22 @@ func (d *Decoder) Decode(in, out []byte, isStereo bool, nanoseconds int, bandwid
d.limitLPCCoefficientsRange(a32Q17)
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.5.8
d.limitLPCFilterPredictionGain(a32Q17)
aQ12 := d.limitLPCFilterPredictionGain(a32Q17)
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.1
if err := d.decodePitchLags(signalType); err != nil {
return nil, err
return err
}
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.2
if err := d.decodeLTPFilterCoefficients(signalType); err != nil {
return nil, err
return err
}
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.3
_, err = d.decodeLTPScalingParamater(signalType)
if err != nil {
return nil, err
return err
}
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.7
@ -1231,14 +1315,14 @@ func (d *Decoder) Decode(in, out []byte, isStereo bool, nanoseconds int, bandwid
pulsecounts, lsbcounts := d.decodePulseAndLSBCounts(shellblocks, rateLevel)
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.8.6
d.decodeExcitation(signalType, quantizationOffsetType, lcgSeed, pulsecounts, lsbcounts)
eQ23 := d.decodeExcitation(signalType, quantizationOffsetType, lcgSeed, pulsecounts, lsbcounts)
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1
d.ltpSynthesis()
res := d.ltpSynthesis(signalType, eQ23)
//https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.2
d.lpcSynthesis()
d.lpcSynthesis(out, bandwidth, dLPC, aQ12, res, gainQ16)
d.isPreviousFrameVoiced = signalType == frameSignalTypeVoiced
return out, nil
return nil
}

View file

@ -28,7 +28,7 @@ func createRangeDecoder(data []byte, bitsRead uint, rangeSize uint32, highAndCod
func TestDecode20MsOnly(t *testing.T) {
d := &Decoder{}
_, err := d.Decode(testSilkFrame(), []byte{}, false, 1, BandwidthWideband)
err := d.Decode(testSilkFrame(), []float64{}, false, 1, BandwidthWideband)
if !errors.Is(err, errUnsupportedSilkFrameDuration) {
t.Fatal(err)
}
@ -36,7 +36,7 @@ func TestDecode20MsOnly(t *testing.T) {
func TestDecodeStereoTODO(t *testing.T) {
d := &Decoder{}
_, err := d.Decode(testSilkFrame(), []byte{}, true, nanoseconds20Ms, BandwidthWideband)
err := d.Decode(testSilkFrame(), []float64{}, true, nanoseconds20Ms, BandwidthWideband)
if !errors.Is(err, errUnsupportedSilkStereo) {
t.Fatal(err)
}
@ -57,16 +57,15 @@ func TestDecodeFrameType(t *testing.T) {
func TestDecodeSubframeQuantizations(t *testing.T) {
d := &Decoder{rangeDecoder: createRangeDecoder(testSilkFrame(), 31, 482344960, 437100388)}
d.decodeSubframeQuantizations(frameSignalTypeInactive)
gainQ16 := d.decodeSubframeQuantizations(frameSignalTypeInactive)
switch {
case d.subframeState[0].gain != 3.21875:
case gainQ16[0] != 210944:
t.Fatal()
case d.subframeState[1].gain != 1.71875:
case gainQ16[1] != 112640:
t.Fatal()
case d.subframeState[2].gain != 1.46875:
case gainQ16[2] != 96256:
t.Fatal()
case d.subframeState[3].gain != 1.46875:
case gainQ16[3] != 96256:
t.Fatal()
}
}
@ -83,16 +82,18 @@ func TestNormalizeLineSpectralFrequencyStageOne(t *testing.T) {
func TestNormalizeLineSpectralFrequencyStageTwo(t *testing.T) {
d := &Decoder{rangeDecoder: createRangeDecoder(testSilkFrame(), 47, 50822640, 5895957)}
resQ10 := d.normalizeLineSpectralFrequencyStageTwo(BandwidthWideband, 9)
dLPC, resQ10 := d.normalizeLineSpectralFrequencyStageTwo(BandwidthWideband, 9)
if !reflect.DeepEqual(resQ10, testResQ10()) {
t.Fatal()
} else if dLPC != 16 {
t.Fatal()
}
}
func TestNormalizeLineSpectralFrequencyCoefficients(t *testing.T) {
d := &Decoder{rangeDecoder: createRangeDecoder(testSilkFrame(), 55, 493249168, 174371199)}
nlsfQ1 := d.normalizeLineSpectralFrequencyCoefficients(BandwidthWideband, testResQ10(), 9)
nlsfQ1 := d.normalizeLineSpectralFrequencyCoefficients(16, BandwidthWideband, testResQ10(), 9)
if !reflect.DeepEqual(nlsfQ1, testNlsfQ1()) {
t.Fatal()
}

View file

@ -63,6 +63,16 @@ func clamp(low, in, high int32) int32 {
return in
}
// clampFloat limits in to the range [low, high]: values above high become
// high, values below low become low, and anything else is returned
// unchanged. The upper bound is checked first.
func clampFloat(low, in, high float64) float64 {
	switch {
	case in > high:
		return high
	case in < low:
		return low
	default:
		return in
	}
}
// The sign of x, i.e.,
// ( -1, x < 0
// sign(x) = < 0, x == 0