cb995e5d2b
Before would only compute s=0
1592 lines
60 KiB
Go
1592 lines
60 KiB
Go
package silk
|
|
|
|
import (
|
|
"github.com/pion/opus/internal/rangecoding"
|
|
)
|
|
|
|
// Decoder maintains the state needed to decode a stream
|
|
// of Silk frames
|
|
type Decoder struct {
|
|
rangeDecoder rangecoding.Decoder
|
|
|
|
// Have we decoded a frame yet?
|
|
haveDecoded bool
|
|
|
|
// Is the previous frame a voiced frame?
|
|
isPreviousFrameVoiced bool
|
|
|
|
previousLogGain int32
|
|
|
|
// The decoder saves the final d_LPC values, i.e., lpc[i] such that
|
|
// (j + n - d_LPC) <= i < (j + n), to feed into the LPC synthesis of the
|
|
// next subframe. This requires storage for up to 16 values of lpc[i]
|
|
// (for WB frames).
|
|
//
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.2
|
|
finalLPCValues []float64
|
|
|
|
// n0Q15 are the LSF coefficients decoded for the prior frame
|
|
// see normalizeLSFInterpolation
|
|
n0Q15 []int16
|
|
}
|
|
|
|
// NewDecoder creates a new Silk Decoder
|
|
func NewDecoder() Decoder {
|
|
return Decoder{
|
|
finalLPCValues: make([]float64, 16),
|
|
}
|
|
}
|
|
|
|
// The LP layer begins with two to eight header bits These consist of one
|
|
// Voice Activity Detection (VAD) bit per frame (up to 3), followed by a
|
|
// single flag indicating the presence of LBRR frames.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.3
|
|
func (d *Decoder) decodeHeaderBits() (voiceActivityDetected, lowBitRateRedundancy bool) {
|
|
voiceActivityDetected = d.rangeDecoder.DecodeSymbolLogP(1) == 1
|
|
lowBitRateRedundancy = d.rangeDecoder.DecodeSymbolLogP(1) == 1
|
|
return
|
|
}
|
|
|
|
// Each SILK frame contains a single "frame type" symbol that jointly
|
|
// codes the signal type and quantization offset type of the
|
|
// corresponding frame.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.3
|
|
func (d *Decoder) determineFrameType(voiceActivityDetected bool) (signalType frameSignalType, quantizationOffsetType frameQuantizationOffsetType) {
|
|
var frameTypeSymbol uint32
|
|
if voiceActivityDetected {
|
|
frameTypeSymbol = d.rangeDecoder.DecodeSymbolWithICDF(icdfFrameTypeVADActive)
|
|
} else {
|
|
frameTypeSymbol = d.rangeDecoder.DecodeSymbolWithICDF(icdfFrameTypeVADInactive)
|
|
}
|
|
|
|
// +------------+-------------+--------------------------+
|
|
// | Frame Type | Signal Type | Quantization Offset Type |
|
|
// +------------+-------------+--------------------------+
|
|
// | 0 | Inactive | Low |
|
|
// | | | |
|
|
// | 1 | Inactive | High |
|
|
// | | | |
|
|
// | 2 | Unvoiced | Low |
|
|
// | | | |
|
|
// | 3 | Unvoiced | High |
|
|
// | | | |
|
|
// | 4 | Voiced | Low |
|
|
// | | | |
|
|
// | 5 | Voiced | High |
|
|
// +------------+-------------+--------------------------+
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.3
|
|
|
|
switch {
|
|
case !voiceActivityDetected && frameTypeSymbol == 0:
|
|
signalType = frameSignalTypeInactive
|
|
quantizationOffsetType = frameQuantizationOffsetTypeLow
|
|
case !voiceActivityDetected:
|
|
signalType = frameSignalTypeInactive
|
|
quantizationOffsetType = frameQuantizationOffsetTypeHigh
|
|
case frameTypeSymbol == 0:
|
|
signalType = frameSignalTypeUnvoiced
|
|
quantizationOffsetType = frameQuantizationOffsetTypeLow
|
|
case frameTypeSymbol == 1:
|
|
signalType = frameSignalTypeUnvoiced
|
|
quantizationOffsetType = frameQuantizationOffsetTypeHigh
|
|
case frameTypeSymbol == 2:
|
|
signalType = frameSignalTypeVoiced
|
|
quantizationOffsetType = frameQuantizationOffsetTypeLow
|
|
case frameTypeSymbol == 3:
|
|
signalType = frameSignalTypeVoiced
|
|
quantizationOffsetType = frameQuantizationOffsetTypeHigh
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// A separate quantization gain is coded for each 5 ms subframe
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.4
|
|
func (d *Decoder) decodeSubframeQuantizations(signalType frameSignalType) (gainQ16 []float64) {
|
|
var logGain, deltaGainIndex, gainIndex int32
|
|
gainQ16 = make([]float64, 4)
|
|
|
|
for subframeIndex := 0; subframeIndex < subframeCount; subframeIndex++ {
|
|
|
|
//The subframe gains are either coded independently, or relative to the
|
|
// gain from the most recent coded subframe in the same channel.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.4
|
|
if subframeIndex == 0 {
|
|
// In an independently coded subframe gain, the 3 most significant bits
|
|
// of the quantization gain are decoded using a PDF selected from
|
|
// Table 11 based on the decoded signal type
|
|
switch signalType {
|
|
case frameSignalTypeInactive:
|
|
gainIndex = int32(d.rangeDecoder.DecodeSymbolWithICDF(icdfIndependentQuantizationGainMSBInactive))
|
|
case frameSignalTypeVoiced:
|
|
gainIndex = int32(d.rangeDecoder.DecodeSymbolWithICDF(icdfIndependentQuantizationGainMSBVoiced))
|
|
case frameSignalTypeUnvoiced:
|
|
gainIndex = int32(d.rangeDecoder.DecodeSymbolWithICDF(icdfIndependentQuantizationGainMSBUnvoiced))
|
|
}
|
|
|
|
// The 3 least significant bits are decoded using a uniform PDF:
|
|
// These 6 bits are combined to form a value, gain_index, between 0 and 63.
|
|
gainIndex = (gainIndex << 3) | int32(d.rangeDecoder.DecodeSymbolWithICDF(icdfIndependentQuantizationGainLSB))
|
|
|
|
// When the gain for the previous subframe is available, then the
|
|
// current gain is limited as follows:
|
|
// log_gain = max(gain_index, previous_log_gain - 16)
|
|
if d.haveDecoded {
|
|
logGain = maxInt32(gainIndex, d.previousLogGain-16)
|
|
} else {
|
|
logGain = gainIndex
|
|
}
|
|
} else {
|
|
// For subframes that do not have an independent gain (including the
|
|
// first subframe of frames not listed as using independent coding
|
|
// above), the quantization gain is coded relative to the gain from the
|
|
// previous subframe
|
|
deltaGainIndex = int32(d.rangeDecoder.DecodeSymbolWithICDF(icdfDeltaQuantizationGain))
|
|
|
|
// The following formula translates this index into a quantization gain
|
|
// for the current subframe using the gain from the previous subframe:
|
|
// log_gain = clamp(0, max(2*delta_gain_index - 16, previous_log_gain + delta_gain_index - 4), 63)
|
|
logGain = int32(clamp(0, maxInt32(2*int32(deltaGainIndex)-16, int32(d.previousLogGain+deltaGainIndex)-4), 63))
|
|
}
|
|
|
|
d.previousLogGain = logGain
|
|
|
|
// silk_gains_dequant() (gain_quant.c) dequantizes log_gain for the k'th
|
|
// subframe and converts it into a linear Q16 scale factor via
|
|
//
|
|
// gain_Q16[k] = silk_log2lin((0x1D1C71*log_gain>>16) + 2090)
|
|
//
|
|
inLogQ7 := (0x1D1C71 * int32(logGain) >> 16) + 2090
|
|
i := inLogQ7 >> 7
|
|
f := inLogQ7 & 127
|
|
|
|
// The function silk_log2lin() (log2lin.c) computes an approximation of
|
|
// 2**(inLog_Q7/128.0), where inLog_Q7 is its Q7 input. Let i =
|
|
// inLog_Q7>>7 be the integer part of inLogQ7 and f = inLog_Q7&127 be
|
|
// the fractional part. Then,
|
|
//
|
|
// (1<<i) + ((-174*f*(128-f)>>16)+f)*((1<<i)>>7)
|
|
//
|
|
// yields the approximate exponential. The final Q16 gain values lies
|
|
// between 81920 and 1686110208, inclusive (representing scale factors
|
|
// of 1.25 to 25728, respectively).
|
|
|
|
gainQ16[subframeIndex] = float64((1 << i) + ((-174*f*(128-f)>>16)+f)*((1<<i)>>7))
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// A set of normalized Line Spectral Frequency (LSF) coefficients follow
|
|
// the quantization gains in the bitstream and represent the Linear
|
|
// Predictive Coding (LPC) coefficients for the current SILK frame.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.1
|
|
func (d *Decoder) normalizeLineSpectralFrequencyStageOne(voiceActivityDetected bool, bandwidth Bandwidth) (I1 uint32) {
|
|
// The first VQ stage uses a 32-element codebook, coded with one of the
|
|
// PDFs in Table 14, depending on the audio bandwidth and the signal
|
|
// type of the current SILK frame. This yields a single index, I1, for
|
|
// the entire frame, which
|
|
//
|
|
// 1. Indexes an element in a coarse codebook,
|
|
// 2. Selects the PDFs for the second stage of the VQ, and
|
|
// 3. Selects the prediction weights used to remove intra-frame
|
|
// redundancy from the second stage.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.1
|
|
switch {
|
|
case !voiceActivityDetected && (bandwidth == BandwidthNarrowband || bandwidth == BandwidthMediumband):
|
|
I1 = d.rangeDecoder.DecodeSymbolWithICDF(icdfNormalizedLSFStageOneIndexNarrowbandOrMediumbandUnvoiced)
|
|
case voiceActivityDetected && (bandwidth == BandwidthNarrowband || bandwidth == BandwidthMediumband):
|
|
I1 = d.rangeDecoder.DecodeSymbolWithICDF(icdfNormalizedLSFStageOneIndexNarrowbandOrMediumbandVoiced)
|
|
case !voiceActivityDetected && (bandwidth == BandwidthWideband):
|
|
I1 = d.rangeDecoder.DecodeSymbolWithICDF(icdfNormalizedLSFStageOneIndexWidebandUnvoiced)
|
|
case voiceActivityDetected && (bandwidth == BandwidthWideband):
|
|
I1 = d.rangeDecoder.DecodeSymbolWithICDF(icdfNormalizedLSFStageOneIndexWidebandVoiced)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// A set of normalized Line Spectral Frequency (LSF) coefficients follow
|
|
// the quantization gains in the bitstream and represent the Linear
|
|
// Predictive Coding (LPC) coefficients for the current SILK frame.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
|
|
func (d *Decoder) normalizeLineSpectralFrequencyStageTwo(bandwidth Bandwidth, I1 uint32) (dLPC int, resQ10 []int16) {
|
|
// Decoding the second stage residual proceeds as follows. For each
|
|
// coefficient, the decoder reads a symbol using the PDF corresponding
|
|
// to I1 from either Table 17 or Table 18,
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
|
|
var codebook [][]uint
|
|
if bandwidth == BandwidthWideband {
|
|
codebook = codebookNormalizedLSFStageTwoIndexWideband
|
|
} else {
|
|
codebook = codebookNormalizedLSFStageTwoIndexNarrowbandOrMediumband
|
|
}
|
|
|
|
I2 := make([]int8, len(codebook[0]))
|
|
for i := 0; i < len(I2); i++ {
|
|
// the decoder reads a symbol using the PDF corresponding
|
|
// to I1 from either Table 17 or Table 18 and subtracts 4 from the
|
|
// result to give an index in the range -4 to 4, inclusive.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
|
|
I2[i] = int8(d.rangeDecoder.DecodeSymbolWithICDF(icdfNormalizedLSFStageTwoIndex[codebook[I1][i]])) - 4
|
|
|
|
// If the index is either -4 or 4, it reads a second symbol using the PDF in
|
|
// Table 19, and adds the value of this second symbol to the index,
|
|
// using the same sign. This gives the index, I2[k], a total range of
|
|
// -10 to 10, inclusive.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
|
|
if I2[i] == -4 {
|
|
I2[i] -= int8(d.rangeDecoder.DecodeSymbolWithICDF(icdfNormalizedLSFStageTwoIndexExtension))
|
|
} else if I2[i] == 4 {
|
|
I2[i] += int8(d.rangeDecoder.DecodeSymbolWithICDF(icdfNormalizedLSFStageTwoIndexExtension))
|
|
}
|
|
}
|
|
|
|
// The decoded indices from both stages are translated back into
|
|
// normalized LSF coefficients. The stage-2 indices represent residuals
|
|
// after both the first stage of the VQ and a separate backwards-prediction
|
|
// step. The backwards prediction process in the encoder subtracts a prediction
|
|
// from each residual formed by a multiple of the coefficient that follows it.
|
|
// The decoder must undo this process.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
|
|
|
|
// qstep is the Q16 quantization step size, which is 11796 for NB and MB and 9830
|
|
// for WB (representing step sizes of approximately 0.18 and 0.15, respectively).
|
|
var qstep int
|
|
if bandwidth == BandwidthWideband {
|
|
qstep = 9830
|
|
} else {
|
|
qstep = 11796
|
|
}
|
|
|
|
// stage-2 residual
|
|
resQ10 = make([]int16, len(I2))
|
|
|
|
// Let d_LPC be the order of the codebook, i.e., 10 for NB and MB, and 16 for WB
|
|
dLPC = len(I2)
|
|
|
|
// for 0 <= k < d_LPC-1
|
|
for k := dLPC - 2; k >= 0; k-- {
|
|
// The stage-2 residual for each coefficient is computed via
|
|
//
|
|
// res_Q10[k] = (k+1 < d_LPC ? (res_Q10[k+1]*pred_Q8[k])>>8 : 0) + ((((I2[k]<<10) - sign(I2[k])*102)*qstep)>>16) ,
|
|
//
|
|
|
|
// The following computes
|
|
//
|
|
// (k+1 < d_LPC ? (res_Q10[k+1]*pred_Q8[k])>>8 : 0)
|
|
//
|
|
firstOperand := int(0)
|
|
if k+1 < dLPC {
|
|
// Each coefficient selects its prediction weight from one of the two lists based on the stage-1 index, I1.
|
|
// let pred_Q8[k] be the weight for the k'th coefficient selected by this process for 0 <= k < d_LPC-1
|
|
predQ8 := int(0)
|
|
if bandwidth == BandwidthWideband {
|
|
predQ8 = int(predictionWeightForWidebandNormalizedLSF[predictionWeightSelectionForWidebandNormalizedLSF[I1][k]][k])
|
|
} else {
|
|
predQ8 = int(predictionWeightForNarrowbandAndMediumbandNormalizedLSF[predictionWeightSelectionForNarrowbandAndMediumbandNormalizedLSF[I1][k]][k])
|
|
}
|
|
|
|
firstOperand = (int(resQ10[k+1]) * predQ8) >> 8
|
|
}
|
|
|
|
// The following computes
|
|
//
|
|
// (((I2[k]<<10) - sign(I2[k])*102)*qstep)>>16
|
|
//
|
|
secondOperand := (((int(I2[k]) << 10) - sign(int(I2[k]))*102) * qstep) >> 16
|
|
|
|
resQ10[k] = int16(firstOperand + secondOperand)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// Once the stage-1 index I1 and the stage-2 residual res_Q10[] have
|
|
// been decoded, the final normalized LSF coefficients can be
|
|
// reconstructed.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
|
|
func (d *Decoder) normalizeLineSpectralFrequencyCoefficients(dLPC int, bandwidth Bandwidth, resQ10 []int16, I1 uint32) (nlsfQ15 []int16) {
|
|
nlsfQ15 = make([]int16, dLPC)
|
|
w2Q18 := make([]uint, dLPC)
|
|
wQ9 := make([]int16, dLPC)
|
|
|
|
cb1Q8 := codebookNormalizedLSFStageOneNarrowbandOrMediumband
|
|
if bandwidth == BandwidthWideband {
|
|
cb1Q8 = codebookNormalizedLSFStageOneWideband
|
|
}
|
|
|
|
// Let cb1_Q8[k] be the k'th entry of the stage-1 codebook vector from Table 23 or Table 24.
|
|
// Then, for 0 <= k < d_LPC, the following expression computes the
|
|
// square of the weight as a Q18 value:
|
|
//
|
|
// w2_Q18[k] = (1024/(cb1_Q8[k] - cb1_Q8[k-1])
|
|
// + 1024/(cb1_Q8[k+1] - cb1_Q8[k])) << 16
|
|
//
|
|
// where cb1_Q8[-1] = 0 and cb1_Q8[d_LPC] = 256, and the division is
|
|
// integer division. This is reduced to an unsquared, Q9 value using
|
|
// the following square-root approximation:
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
|
|
for k := 0; k < dLPC; k++ {
|
|
kMinusOne, kPlusOne := uint(0), uint(256)
|
|
if k != 0 {
|
|
kMinusOne = cb1Q8[I1][k-1]
|
|
}
|
|
|
|
if k+1 != dLPC {
|
|
kPlusOne = cb1Q8[I1][k+1]
|
|
}
|
|
|
|
w2Q18[k] = (1024/(cb1Q8[I1][k]-kMinusOne) +
|
|
1024/(kPlusOne-cb1Q8[I1][k])) << 16
|
|
|
|
// This is reduced to an unsquared, Q9 value using
|
|
// the following square-root approximation:
|
|
//
|
|
// i = ilog(w2_Q18[k])
|
|
// f = (w2_Q18[k]>>(i-8)) & 127
|
|
// y = ((i&1) ? 32768 : 46214) >> ((32-i)>>1)
|
|
// w_Q9[k] = y + ((213*f*y)>>16)
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
|
|
i := ilog(int(w2Q18[k]))
|
|
f := int((w2Q18[k] >> (i - 8)) & 127)
|
|
|
|
y := 46214
|
|
if (i & 1) != 0 {
|
|
y = 32768
|
|
}
|
|
|
|
y = y >> ((32 - i) >> 1)
|
|
wQ9[k] = int16(y + ((213 * f * y) >> 16))
|
|
|
|
// Given the stage-1 codebook entry cb1_Q8[], the stage-2 residual
|
|
// res_Q10[], and their corresponding weights, w_Q9[], the reconstructed
|
|
// normalized LSF coefficients are
|
|
//
|
|
// NLSF_Q15[k] = clamp(0,
|
|
// (cb1_Q8[k]<<7) + (res_Q10[k]<<14)/w_Q9[k], 32767)
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
|
|
nlsfQ15[k] = int16(clamp(0,
|
|
int32((int(cb1Q8[I1][k])<<7)+(int(resQ10[k])<<14)/int(wQ9[k])), 32767))
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// The normalized LSF stabilization procedure ensures that
|
|
// consecutive values of the normalized LSF coefficients, NLSF_Q15[],
|
|
// are spaced some minimum distance apart (predetermined to be the 0.01
|
|
// percentile of a large training set).
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.4
|
|
func (d *Decoder) normalizeLSFStabilization(nlsfQ15 []int16) {
|
|
// TODO
|
|
}
|
|
|
|
// For 20 ms SILK frames, the first half of the frame (i.e., the first
|
|
// two subframes) may use normalized LSF coefficients that are
|
|
// interpolated between the decoded LSFs for the most recent coded frame
|
|
// (in the same channel) and the current frame
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.5
|
|
func (d *Decoder) normalizeLSFInterpolation(n2Q15 []int16) (n1Q15 []int16, wQ2 int16) {
|
|
// Let n2_Q15[k] be the normalized LSF coefficients decoded by the
|
|
// procedure in Section 4.2.7.5, n0_Q15[k] be the LSF coefficients
|
|
// decoded for the prior frame, and w_Q2 be the interpolation factor.
|
|
// Then, the normalized LSF coefficients used for the first half of a
|
|
// 20 ms frame, n1_Q15[k], are
|
|
//
|
|
// n1_Q15[k] = n0_Q15[k] + (w_Q2*(n2_Q15[k] - n0_Q15[k]) >> 2)
|
|
wQ2 = int16(d.rangeDecoder.DecodeSymbolWithICDF(icdfNormalizedLSFInterpolationIndex))
|
|
if wQ2 == 4 || !d.haveDecoded {
|
|
return n2Q15, wQ2
|
|
}
|
|
|
|
n1Q15 = make([]int16, len(n2Q15))
|
|
for k := range n1Q15 {
|
|
n1Q15[k] = d.n0Q15[k] + (wQ2 * (n2Q15[k] - d.n0Q15[k]) >> 2)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func (d *Decoder) convertNormalizedLSFsToLPCCoefficients(n1Q15 []int16, bandwidth Bandwidth) (a32Q17 []int32) {
|
|
cQ17 := make([]int32, len(n1Q15))
|
|
cosQ12 := q12CosineTableForLSFConverion
|
|
|
|
ordering := lsfOrderingForPolynomialEvaluationNarrowbandAndMediumband
|
|
if bandwidth == BandwidthWideband {
|
|
ordering = lsfOrderingForPolynomialEvaluationWideband
|
|
}
|
|
|
|
// The top 7 bits of each normalized LSF coefficient index a value in
|
|
// the table, and the next 8 bits interpolate between it and the next
|
|
// value. Let i = (n[k] >> 8) be the integer index and f = (n[k] & 255)
|
|
// be the fractional part of a given coefficient. Then, the re-ordered,
|
|
// approximated cosine, c_Q17[ordering[k]], is
|
|
//
|
|
// c_Q17[ordering[k]] = (cos_Q12[i]*256
|
|
// + (cos_Q12[i+1]-cos_Q12[i])*f + 4) >> 3
|
|
//
|
|
// where ordering[k] is the k'th entry of the column of Table 27
|
|
// corresponding to the current audio bandwidth and cos_Q12[i] is the
|
|
// i'th entry of Table 28.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.6
|
|
for k := range n1Q15 {
|
|
i := int32(n1Q15[k] >> 8)
|
|
f := int32(n1Q15[k] & 255)
|
|
|
|
cQ17[ordering[k]] = (cosQ12[i]*256 +
|
|
(cosQ12[i+1]-cosQ12[i])*f + 4) >> 3
|
|
}
|
|
|
|
pQ16 := make([]int32, (len(n1Q15)/2)+1)
|
|
qQ16 := make([]int32, (len(n1Q15)/2)+1)
|
|
|
|
// Given the list of cosine values compute the coefficients of P and Q,
|
|
// described here via a simple recurrence. Let p_Q16[k][j] and q_Q16[k][j]
|
|
// be the coefficients of the products of the first (k+1) root pairs for P and
|
|
// Q, with j indexing the coefficient number. Only the first (k+2) coefficients
|
|
// are needed, as the products are symmetric. Let
|
|
//
|
|
// p_Q16[0][0] = q_Q16[0][0] = 1<<16
|
|
// p_Q16[0][1] = -c_Q17[0]
|
|
// q_Q16[0][1] = -c_Q17[1]
|
|
// d2 = d_LPC/2
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.6
|
|
|
|
pQ16[0] = 1 << 16
|
|
qQ16[0] = 1 << 16
|
|
pQ16[1] = -cQ17[0]
|
|
qQ16[1] = -cQ17[1]
|
|
dLPC := len(n1Q15)
|
|
d2 := dLPC / 2
|
|
|
|
// As boundary conditions, assume p_Q16[k][j] = q_Q16[k][j] = 0 for all j < 0.
|
|
// Also, assume (because of the symmetry)
|
|
//
|
|
// p_Q16[k][k+2] = p_Q16[k][k]
|
|
// q_Q16[k][k+2] = q_Q16[k][k]
|
|
//
|
|
// Then, for 0 < k < d2 and 0 <= j <= k+1,
|
|
//
|
|
// p_Q16[k][j] = p_Q16[k-1][j] + p_Q16[k-1][j-2]
|
|
// - ((c_Q17[2*k]*p_Q16[k-1][j-1] + 32768)>>16)
|
|
//
|
|
// q_Q16[k][j] = q_Q16[k-1][j] + q_Q16[k-1][j-2]
|
|
// - ((c_Q17[2*k+1]*q_Q16[k-1][j-1] + 32768)>>16)
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.6
|
|
|
|
for k := 1; k < d2; k++ {
|
|
pQ16[k+1] = pQ16[k-1]*2 - int32(((int64(cQ17[2*k])*int64(pQ16[k]))+32768)>>16)
|
|
qQ16[k+1] = qQ16[k-1]*2 - int32(((int64(cQ17[(2*k)+1])*int64(qQ16[k]))+32768)>>16)
|
|
|
|
for j := k; j > 1; j-- {
|
|
pQ16[j] += pQ16[j-2] - int32(((int64(cQ17[2*k])*int64(pQ16[j-1]))+32768)>>16)
|
|
qQ16[j] += qQ16[j-2] - int32(((int64(cQ17[(2*k)+1])*int64(qQ16[j-1]))+32768)>>16)
|
|
}
|
|
|
|
pQ16[1] -= cQ17[2*k]
|
|
qQ16[1] -= cQ17[2*k+1]
|
|
}
|
|
|
|
// silk_NLSF2A() uses the values from the last row of this recurrence to
|
|
// reconstruct a 32-bit version of the LPC filter (without the leading
|
|
// 1.0 coefficient), a32_Q17[k], 0 <= k < d2:
|
|
//
|
|
// a32_Q17[k] = -(q_Q16[d2-1][k+1] - q_Q16[d2-1][k])
|
|
// - (p_Q16[d2-1][k+1] + p_Q16[d2-1][k]))
|
|
//
|
|
// a32_Q17[d_LPC-k-1] = (q_Q16[d2-1][k+1] - q_Q16[d2-1][k])
|
|
// - (p_Q16[d2-1][k+1] + p_Q16[d2-1][k]))
|
|
//
|
|
// The sum and difference of two terms from each of the p_Q16 and q_Q16
|
|
// coefficient lists reflect the (1 + z**-1) and (1 - z**-1) factors of
|
|
// P and Q, respectively. The promotion of the expression from Q16 to
|
|
// Q17 implicitly scales the result by 1/2.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.6
|
|
|
|
a32Q17 = make([]int32, len(n1Q15))
|
|
for k := 0; k < d2; k++ {
|
|
a32Q17[k] = -(qQ16[k+1] - qQ16[k]) - (pQ16[k+1] + pQ16[k])
|
|
a32Q17[dLPC-k-1] = (qQ16[k+1] - qQ16[k]) - (pQ16[k+1] + pQ16[k])
|
|
}
|
|
return
|
|
}
|
|
|
|
// As described in Section 4.2.7.8.6, SILK uses a Linear Congruential
|
|
// Generator (LCG) to inject pseudorandom noise into the quantized
|
|
// excitation. To ensure synchronization of this process between the
|
|
// encoder and decoder, each SILK frame stores a 2-bit seed after the
|
|
// LTP parameters (if any).
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.7
|
|
func (d *Decoder) decodeLinearCongruentialGeneratorSeed() uint32 {
|
|
return d.rangeDecoder.DecodeSymbolWithICDF(icdfLinearCongruentialGeneratorSeed)
|
|
}
|
|
|
|
// SILK fixes the dimension of the codebook to N = 16. The excitation
|
|
// is made up of a number of "shell blocks", each 16 samples in size.
|
|
// Table 44 lists the number of shell blocks required for a SILK frame
|
|
// for each possible audio bandwidth and frame size.
|
|
//
|
|
// +-----------------+------------+------------------------+
|
|
// | Audio Bandwidth | Frame Size | Number of Shell Blocks |
|
|
// +-----------------+------------+------------------------+
|
|
// | NB | 10 ms | 5 |
|
|
// | | | |
|
|
// | MB | 10 ms | 8 |
|
|
// | | | |
|
|
// | WB | 10 ms | 10 |
|
|
// | | | |
|
|
// | NB | 20 ms | 10 |
|
|
// | | | |
|
|
// | MB | 20 ms | 15 |
|
|
// | | | |
|
|
// | WB | 20 ms | 20 |
|
|
// +-----------------+------------+------------------------+
|
|
//
|
|
// Table 44: Number of Shell Blocks Per SILK Frame
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8
|
|
func (d *Decoder) decodeShellblocks(nanoseconds int, bandwidth Bandwidth) (shellblocks int) {
|
|
switch {
|
|
case bandwidth == BandwidthNarrowband && nanoseconds == nanoseconds10Ms:
|
|
shellblocks = 5
|
|
case bandwidth == BandwidthMediumband && nanoseconds == nanoseconds10Ms:
|
|
shellblocks = 8
|
|
case bandwidth == BandwidthWideband && nanoseconds == nanoseconds10Ms:
|
|
fallthrough
|
|
case bandwidth == BandwidthNarrowband && nanoseconds == nanoseconds20Ms:
|
|
shellblocks = 10
|
|
case bandwidth == BandwidthMediumband && nanoseconds == nanoseconds20Ms:
|
|
shellblocks = 15
|
|
case bandwidth == BandwidthWideband && nanoseconds == nanoseconds20Ms:
|
|
shellblocks = 20
|
|
}
|
|
return
|
|
}
|
|
|
|
// The first symbol in the excitation is a "rate level", which is an
|
|
// index from 0 to 8, inclusive, coded using the PDF in Table 45
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.1
|
|
func (d *Decoder) decodeRatelevel(voiceActivityDetected bool) uint32 {
|
|
if voiceActivityDetected {
|
|
return d.rangeDecoder.DecodeSymbolWithICDF(icdfRateLevelVoiced)
|
|
}
|
|
|
|
return d.rangeDecoder.DecodeSymbolWithICDF(icdfRateLevelUnvoiced)
|
|
}
|
|
|
|
// The total number of pulses in each of the shell blocks follows the
|
|
// rate level. The pulse counts for all of the shell blocks are coded
|
|
// consecutively, before the content of any of the blocks.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.2
|
|
func (d *Decoder) decodePulseAndLSBCounts(shellblocks int, rateLevel uint32) (pulsecounts []uint8, lsbcounts []uint8) {
|
|
pulsecounts = make([]uint8, shellblocks)
|
|
lsbcounts = make([]uint8, shellblocks)
|
|
for i := 0; i < shellblocks; i++ {
|
|
pulsecounts[i] = uint8(d.rangeDecoder.DecodeSymbolWithICDF(icdfPulseCount[rateLevel]))
|
|
|
|
// The special value 17 indicates that this block
|
|
// has one or more additional LSBs to decode for each coefficient.
|
|
if pulsecounts[i] == 17 {
|
|
// If the decoder encounters this value, it decodes another value for the
|
|
// actual pulse count of the block, but uses the PDF corresponding to
|
|
// the special rate level 9 instead of the normal rate level.
|
|
// This Process repeats until the decoder reads a value less than 17, and it
|
|
// Then sets the number of extra LSBs used to the number of 17's decoded
|
|
// For that block.
|
|
lsbcount := uint8(0)
|
|
for ; pulsecounts[i] == 17 && lsbcount < 10; lsbcount++ {
|
|
pulsecounts[i] = uint8(d.rangeDecoder.DecodeSymbolWithICDF(icdfPulseCount[9]))
|
|
}
|
|
lsbcounts[i] = lsbcount
|
|
|
|
// If it reads the value 17 ten times, then the next
|
|
// Iteration uses the special rate level 10 instead of 9. The
|
|
// Probability of decoding a 17 when using the PDF for rate level 10 is
|
|
// Zero, ensuring that the number of LSBs for a block will not exceed
|
|
// 10. The cumulative distribution for rate level 10 is just a shifted
|
|
// Version of that for 9 and thus does not require any additional
|
|
// Storage.
|
|
if lsbcount == 10 {
|
|
pulsecounts[i] = uint8(d.rangeDecoder.DecodeSymbolWithICDF(icdfPulseCount[10]))
|
|
}
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// The locations of the pulses in each shell block follow the pulse
|
|
// counts. As with the pulse counts, these locations are coded for all the shell blocks
|
|
// before any of the remaining information for each block. Unlike many
|
|
// other codecs, SILK places no restriction on the distribution of
|
|
// pulses within a shell block. All of the pulses may be placed in a
|
|
// single location, or each one in a unique location, or anything in
|
|
// between.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.3
|
|
func (d *Decoder) decodePulseLocation(pulsecounts []uint8) (eRaw []int32) {
|
|
eRaw = make([]int32, len(pulsecounts)*pulsecountLargestPartitionSize)
|
|
for i := range pulsecounts {
|
|
// This process skips partitions without any pulses, i.e., where
|
|
// the initial pulse count from Section 4.2.7.8.2 was zero, or where the
|
|
// split in the prior level indicated that all of the pulses fell on the
|
|
// other side. These partitions have nothing to code, so they require
|
|
// no PDF.
|
|
if pulsecounts[i] == 0 {
|
|
continue
|
|
}
|
|
|
|
eRawIndex := pulsecountLargestPartitionSize * i
|
|
samplePartition16 := make([]uint8, 2)
|
|
samplePartition8 := make([]uint8, 2)
|
|
samplePartition4 := make([]uint8, 2)
|
|
samplePartition2 := make([]uint8, 2)
|
|
|
|
// The location of pulses is coded by recursively partitioning each
|
|
// block into halves, and coding how many pulses fall on the left side
|
|
// of the split. All remaining pulses must fall on the right side of
|
|
// the split.
|
|
d.partitionPulseCount(icdfPulseCountSplit16SamplePartitions, pulsecounts[i], samplePartition16)
|
|
for j := 0; j < 2; j++ {
|
|
d.partitionPulseCount(icdfPulseCountSplit8SamplePartitions, samplePartition16[j], samplePartition8)
|
|
for k := 0; k < 2; k++ {
|
|
d.partitionPulseCount(icdfPulseCountSplit4SamplePartitions, samplePartition8[k], samplePartition4)
|
|
for l := 0; l < 2; l++ {
|
|
d.partitionPulseCount(icdfPulseCountSplit2SamplePartitions, samplePartition4[l], samplePartition2)
|
|
eRaw[eRawIndex] = int32(samplePartition2[0])
|
|
eRawIndex++
|
|
|
|
eRaw[eRawIndex] = int32(samplePartition2[1])
|
|
eRawIndex++
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// After the decoder reads the pulse locations for all blocks, it reads
|
|
// the LSBs (if any) for each block in turn. Inside each block, it
|
|
// reads all the LSBs for each coefficient in turn, even those where no
|
|
// pulses were allocated, before proceeding to the next one.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.4
|
|
func (d *Decoder) decodeExcitationLSB(eRaw []int32, lsbcounts []uint8) {
|
|
for i := 0; i < len(eRaw); i++ {
|
|
for bit := uint8(0); bit < lsbcounts[i/pulsecountLargestPartitionSize]; bit++ {
|
|
eRaw[i] = (eRaw[i] << 1) | int32(d.rangeDecoder.DecodeSymbolWithICDF(icdfExcitationLSB))
|
|
}
|
|
}
|
|
}
|
|
|
|
// After decoding the pulse locations and the LSBs, the decoder knows
|
|
// the magnitude of each coefficient in the excitation.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.5
|
|
func (d *Decoder) decodeExcitationSign(eRaw []int32, signalType frameSignalType, quantizationOffsetType frameQuantizationOffsetType, pulsecounts []uint8) {
|
|
for i := 0; i < len(eRaw); i++ {
|
|
// It then decodes a sign for all coefficients
|
|
// with a non-zero magnitude
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.5
|
|
if eRaw[i] == 0 {
|
|
continue
|
|
}
|
|
|
|
var icdf []uint
|
|
pulsecount := pulsecounts[i/pulsecountLargestPartitionSize]
|
|
|
|
// using one of the PDFs from Table 52.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.5
|
|
switch signalType {
|
|
case frameSignalTypeInactive:
|
|
switch quantizationOffsetType {
|
|
case frameQuantizationOffsetTypeLow:
|
|
switch pulsecount {
|
|
case 0:
|
|
icdf = icdfExcitationSignInactiveSignalLowQuantization0Pulse
|
|
case 1:
|
|
icdf = icdfExcitationSignInactiveSignalLowQuantization1Pulse
|
|
case 2:
|
|
icdf = icdfExcitationSignInactiveSignalLowQuantization2Pulse
|
|
case 3:
|
|
icdf = icdfExcitationSignInactiveSignalLowQuantization3Pulse
|
|
case 4:
|
|
icdf = icdfExcitationSignInactiveSignalLowQuantization4Pulse
|
|
case 5:
|
|
icdf = icdfExcitationSignInactiveSignalLowQuantization5Pulse
|
|
default:
|
|
icdf = icdfExcitationSignInactiveSignalLowQuantization6PlusPulse
|
|
}
|
|
case frameQuantizationOffsetTypeHigh:
|
|
switch pulsecount {
|
|
case 0:
|
|
icdf = icdfExcitationSignInactiveSignalHighQuantization0Pulse
|
|
case 1:
|
|
icdf = icdfExcitationSignInactiveSignalHighQuantization1Pulse
|
|
case 2:
|
|
icdf = icdfExcitationSignInactiveSignalHighQuantization2Pulse
|
|
case 3:
|
|
icdf = icdfExcitationSignInactiveSignalHighQuantization3Pulse
|
|
case 4:
|
|
icdf = icdfExcitationSignInactiveSignalHighQuantization4Pulse
|
|
case 5:
|
|
icdf = icdfExcitationSignInactiveSignalHighQuantization5Pulse
|
|
default:
|
|
icdf = icdfExcitationSignInactiveSignalHighQuantization6PlusPulse
|
|
}
|
|
|
|
}
|
|
case frameSignalTypeUnvoiced:
|
|
switch quantizationOffsetType {
|
|
case frameQuantizationOffsetTypeLow:
|
|
switch pulsecount {
|
|
case 0:
|
|
icdf = icdfExcitationSignUnvoicedSignalLowQuantization0Pulse
|
|
case 1:
|
|
icdf = icdfExcitationSignUnvoicedSignalLowQuantization1Pulse
|
|
case 2:
|
|
icdf = icdfExcitationSignUnvoicedSignalLowQuantization2Pulse
|
|
case 3:
|
|
icdf = icdfExcitationSignUnvoicedSignalLowQuantization3Pulse
|
|
case 4:
|
|
icdf = icdfExcitationSignUnvoicedSignalLowQuantization4Pulse
|
|
case 5:
|
|
icdf = icdfExcitationSignUnvoicedSignalLowQuantization5Pulse
|
|
default:
|
|
icdf = icdfExcitationSignUnvoicedSignalLowQuantization6PlusPulse
|
|
}
|
|
case frameQuantizationOffsetTypeHigh:
|
|
switch pulsecount {
|
|
case 0:
|
|
icdf = icdfExcitationSignUnvoicedSignalHighQuantization0Pulse
|
|
case 1:
|
|
icdf = icdfExcitationSignUnvoicedSignalHighQuantization1Pulse
|
|
case 2:
|
|
icdf = icdfExcitationSignUnvoicedSignalHighQuantization2Pulse
|
|
case 3:
|
|
icdf = icdfExcitationSignUnvoicedSignalHighQuantization3Pulse
|
|
case 4:
|
|
icdf = icdfExcitationSignUnvoicedSignalHighQuantization4Pulse
|
|
case 5:
|
|
icdf = icdfExcitationSignUnvoicedSignalHighQuantization5Pulse
|
|
default:
|
|
icdf = icdfExcitationSignUnvoicedSignalHighQuantization6PlusPulse
|
|
}
|
|
|
|
}
|
|
|
|
case frameSignalTypeVoiced:
|
|
switch quantizationOffsetType {
|
|
case frameQuantizationOffsetTypeLow:
|
|
switch pulsecount {
|
|
case 0:
|
|
icdf = icdfExcitationSignVoicedSignalLowQuantization0Pulse
|
|
case 1:
|
|
icdf = icdfExcitationSignVoicedSignalLowQuantization1Pulse
|
|
case 2:
|
|
icdf = icdfExcitationSignVoicedSignalLowQuantization2Pulse
|
|
case 3:
|
|
icdf = icdfExcitationSignVoicedSignalLowQuantization3Pulse
|
|
case 4:
|
|
icdf = icdfExcitationSignVoicedSignalLowQuantization4Pulse
|
|
case 5:
|
|
icdf = icdfExcitationSignVoicedSignalLowQuantization5Pulse
|
|
default:
|
|
icdf = icdfExcitationSignVoicedSignalLowQuantization6PlusPulse
|
|
}
|
|
case frameQuantizationOffsetTypeHigh:
|
|
switch pulsecount {
|
|
case 0:
|
|
icdf = icdfExcitationSignVoicedSignalHighQuantization0Pulse
|
|
case 1:
|
|
icdf = icdfExcitationSignVoicedSignalHighQuantization1Pulse
|
|
case 2:
|
|
icdf = icdfExcitationSignVoicedSignalHighQuantization2Pulse
|
|
case 3:
|
|
icdf = icdfExcitationSignVoicedSignalHighQuantization3Pulse
|
|
case 4:
|
|
icdf = icdfExcitationSignVoicedSignalHighQuantization4Pulse
|
|
case 5:
|
|
icdf = icdfExcitationSignVoicedSignalHighQuantization5Pulse
|
|
default:
|
|
icdf = icdfExcitationSignVoicedSignalHighQuantization6PlusPulse
|
|
}
|
|
}
|
|
}
|
|
|
|
// If the value decoded is 0, then the coefficient magnitude is negated.
|
|
// Otherwise, it remains positive.
|
|
if d.rangeDecoder.DecodeSymbolWithICDF(icdf) == 0 {
|
|
eRaw[i] *= -1
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
// SILK codes the excitation using a modified version of the Pyramid
|
|
// Vector Quantizer (PVQ) codebook [PVQ]. The PVQ codebook is designed
|
|
// for Laplace-distributed values and consists of all sums of K signed,
|
|
// unit pulses in a vector of dimension N, where two pulses at the same
|
|
// position are required to have the same sign.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8
|
|
func (d *Decoder) decodeExcitation(signalType frameSignalType, quantizationOffsetType frameQuantizationOffsetType, seed uint32, pulsecounts, lsbcounts []uint8) (eQ23 []int32) {
|
|
// After the signs have been read, there is enough information to
|
|
// reconstruct the complete excitation signal. This requires adding a
|
|
// constant quantization offset to each non-zero sample and then
|
|
// pseudorandomly inverting and offsetting every sample.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.6
|
|
|
|
// The constant quantization offset varies depending on the signal type and
|
|
// quantization offset type
|
|
|
|
// +-------------+--------------------------+--------------------------+
|
|
// | Signal Type | Quantization Offset Type | Quantization Offset |
|
|
// | | | (Q23) |
|
|
// +-------------+--------------------------+--------------------------+
|
|
// | Inactive | Low | 25 |
|
|
// | | | |
|
|
// | Inactive | High | 60 |
|
|
// | | | |
|
|
// | Unvoiced | Low | 25 |
|
|
// | | | |
|
|
// | Unvoiced | High | 60 |
|
|
// | | | |
|
|
// | Voiced | Low | 8 |
|
|
// | | | |
|
|
// | Voiced | High | 25 |
|
|
// +-------------+--------------------------+--------------------------+
|
|
// Table 53: Excitation Quantization Offsets
|
|
var offsetQ23 int32
|
|
switch {
|
|
case signalType == frameSignalTypeInactive && quantizationOffsetType == frameQuantizationOffsetTypeLow:
|
|
offsetQ23 = 25
|
|
case signalType == frameSignalTypeInactive && quantizationOffsetType == frameQuantizationOffsetTypeHigh:
|
|
offsetQ23 = 60
|
|
case signalType == frameSignalTypeUnvoiced && quantizationOffsetType == frameQuantizationOffsetTypeLow:
|
|
offsetQ23 = 25
|
|
case signalType == frameSignalTypeUnvoiced && quantizationOffsetType == frameQuantizationOffsetTypeHigh:
|
|
offsetQ23 = 25
|
|
case signalType == frameSignalTypeVoiced && quantizationOffsetType == frameQuantizationOffsetTypeLow:
|
|
offsetQ23 = 8
|
|
case signalType == frameSignalTypeVoiced && quantizationOffsetType == frameQuantizationOffsetTypeHigh:
|
|
offsetQ23 = 25
|
|
}
|
|
|
|
// Let e_raw[i] be the raw excitation value at position i,
|
|
// with a magnitude composed of the pulses at that location (see Section 4.2.7.8.3)
|
|
eRaw := d.decodePulseLocation(pulsecounts)
|
|
|
|
// combined with any additional LSBs (see Section 4.2.7.8.4),
|
|
d.decodeExcitationLSB(eRaw, lsbcounts)
|
|
|
|
// and with the corresponding sign decoded in Section 4.2.7.8.5.
|
|
d.decodeExcitationSign(eRaw, signalType, quantizationOffsetType, pulsecounts)
|
|
|
|
eQ23 = make([]int32, len(eRaw))
|
|
for i := 0; i < len(eRaw); i++ {
|
|
// Additionally, let seed be the current pseudorandom seed, which is initialized to the
|
|
// value decoded from Section 4.2.7.7 for the first sample in the current SILK frame, and
|
|
// updated for each subsequent sample according to the procedure below.
|
|
// Finally, let offset_Q23 be the quantization offset from Table 53.
|
|
// Then the following procedure produces the final reconstructed
|
|
// excitation value, e_Q23[i]:
|
|
|
|
// e_Q23[i] = (e_raw[i] << 8) - sign(e_raw[i])*20 + offset_Q23;
|
|
// seed = (196314165*seed + 907633515) & 0xFFFFFFFF;
|
|
// e_Q23[i] = (seed & 0x80000000) ? -e_Q23[i] : e_Q23[i];
|
|
// seed = (seed + e_raw[i]) & 0xFFFFFFFF;
|
|
|
|
// When e_raw[i] is zero, sign() returns 0 by the definition in
|
|
// Section 1.1.4, so the factor of 20 does not get added. The final
|
|
// e_Q23[i] value may require more than 16 bits per sample, but it will
|
|
// not require more than 23, including the sign.
|
|
|
|
eQ23[i] = (eRaw[i] << 8) - int32(sign(int(eRaw[i])))*20 + offsetQ23
|
|
seed = (196314165*seed + 907633515) & 0xFFFFFFFF
|
|
if seed&0x80000000 != 0 {
|
|
eQ23[i] *= -1
|
|
}
|
|
seed = (seed + uint32(eRaw[i])) & 0xFFFFFFFF
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// The PDF to use is chosen by the size of the current partition (16, 8, 4, or 2) and the
|
|
// number of pulses in the partition (1 to 16, inclusive)
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.8.3
|
|
func (d *Decoder) partitionPulseCount(icdf [][]uint, block uint8, halves []uint8) {
|
|
// This process skips partitions without any pulses, i.e., where
|
|
// the initial pulse count from Section 4.2.7.8.2 was zero, or where the
|
|
// split in the prior level indicated that all of the pulses fell on the
|
|
// other side. These partitions have nothing to code, so they require
|
|
// no PDF.
|
|
if block == 0 {
|
|
halves[0] = 0
|
|
halves[1] = 0
|
|
} else {
|
|
halves[0] = uint8(d.rangeDecoder.DecodeSymbolWithICDF(icdf[block-1]))
|
|
halves[1] = block - halves[0]
|
|
}
|
|
}
|
|
|
|
// The a32_Q17[] coefficients are too large to fit in a 16-bit value,
|
|
// which significantly increases the cost of applying this filter in
|
|
// fixed-point decoders. Reducing them to Q12 precision doesn't incur
|
|
// any significant quality loss, but still does not guarantee they will
|
|
// fit.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.7
|
|
func (d *Decoder) limitLPCCoefficientsRange(a32Q17 []int32) {
|
|
bandwidthExpansionRound := 0
|
|
for ; bandwidthExpansionRound < 10; bandwidthExpansionRound++ {
|
|
|
|
// For each round, the process first finds the index k such that
|
|
// abs(a32_Q17[k]) is largest, breaking ties by choosing the lowest
|
|
// value of k.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.7
|
|
maxabsQ17K := uint(0)
|
|
maxabsQ17 := uint(0)
|
|
|
|
for k, val := range a32Q17 {
|
|
abs := int32(sign(int(val))) * val
|
|
if maxabsQ17 < uint(abs) {
|
|
maxabsQ17K = uint(k)
|
|
maxabsQ17 = uint(abs)
|
|
}
|
|
}
|
|
|
|
// Then, it computes the corresponding Q12 precision value,
|
|
// maxabs_Q12, subject to an upper bound to avoid overflow in subsequent
|
|
// computations:
|
|
//
|
|
// maxabs_Q12 = min((maxabs_Q17 + 16) >> 5, 163838)
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.7
|
|
|
|
maxabsQ12 := minUint((maxabsQ17+16)>>5, 163838)
|
|
|
|
// If this is larger than 32767, the procedure derives the chirp factor,
|
|
// sc_Q16[0], to use in the bandwidth expansion as
|
|
//
|
|
// (maxabs_Q12 - 32767) << 14
|
|
// sc_Q16[0] = 65470 - --------------------------
|
|
// (maxabs_Q12 * (k+1)) >> 2
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.7
|
|
if maxabsQ12 > 32767 {
|
|
scQ16 := make([]uint, len(a32Q17))
|
|
|
|
scQ16[0] = uint(65470)
|
|
scQ16[0] -= ((maxabsQ12 - 32767) << 14) / ((maxabsQ12 * (maxabsQ17K + 1)) >> 2)
|
|
|
|
// silk_bwexpander_32() (bwexpander_32.c) performs the bandwidth
|
|
// expansion (again, only when maxabs_Q12 is greater than 32767) using
|
|
// the following recurrence:
|
|
//
|
|
// a32_Q17[k] = (a32_Q17[k]*sc_Q16[k]) >> 16
|
|
//
|
|
// sc_Q16[k+1] = (sc_Q16[0]*sc_Q16[k] + 32768) >> 16
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.7
|
|
for k := 0; k < len(a32Q17); k++ {
|
|
a32Q17[k] = (a32Q17[k] * int32(scQ16[k])) >> 16
|
|
if len(scQ16) <= k {
|
|
scQ16[k+1] = (scQ16[0]*scQ16[k] + 32768) >> 16
|
|
}
|
|
}
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
|
|
// After 10 rounds of bandwidth expansion are performed, they are simply
|
|
// saturated to 16 bits:
|
|
//
|
|
// a32_Q17[k] = clamp(-32768, (a32_Q17[k] + 16) >> 5, 32767) << 5
|
|
//
|
|
// Because this performs the actual saturation in the Q12 domain, but
|
|
// saturation is not performed if maxabs_Q12 drops to 32767 or less
|
|
// prior to the 10th round.
|
|
if bandwidthExpansionRound == 9 {
|
|
for k := 0; k < len(a32Q17); k++ {
|
|
a32Q17[k] = clamp(-32768, (a32Q17[k]+16)>>5, 32767) << 5
|
|
}
|
|
}
|
|
}
|
|
|
|
// The prediction gain of an LPC synthesis filter is the square root of
|
|
// the output energy when the filter is excited by a unit-energy
|
|
// impulse. Even if the Q12 coefficients would fit, the resulting
|
|
// filter may still have a significant gain (especially for voiced
|
|
// sounds), making the filter unstable. silk_NLSF2A() applies up to 16
|
|
// additional rounds of bandwidth expansion to limit the prediction
|
|
// gain.
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.8
|
|
func (d *Decoder) limitLPCFilterPredictionGain(a32Q17 []int32) (aQ12 []float64) {
|
|
aQ12 = make([]float64, len(a32Q17))
|
|
|
|
// However, silk_LPC_inverse_pred_gain_QA() approximates this using
|
|
// fixed-point arithmetic to guarantee reproducible results across
|
|
// platforms and implementations. Since small changes in the
|
|
// coefficients can make a stable filter unstable, it takes the real Q12
|
|
// coefficients that will be used during reconstruction as input. Thus,
|
|
// let
|
|
//
|
|
// a32_Q12[n] = (a32_Q17[n] + 16) >> 5
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.8
|
|
for n := range a32Q17 {
|
|
aQ12[n] = float64((a32Q17[n] + 16) >> 5)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.1
|
|
func (d *Decoder) decodePitchLags(signalType frameSignalType, bandwidth Bandwidth) (lag uint32, pitchLags []int) {
|
|
if signalType != frameSignalTypeVoiced {
|
|
return
|
|
}
|
|
|
|
var (
|
|
lagMin uint32
|
|
lagMax uint32
|
|
)
|
|
|
|
// The primary lag index is coded either relative to the primary lag of
|
|
// the prior frame in the same channel or as an absolute index.
|
|
// Absolute coding is used if and only if
|
|
//
|
|
// * This is the first SILK frame of its type (LBRR or regular) for
|
|
// this channel in the current Opus frame,
|
|
//
|
|
// * The previous SILK frame of the same type (LBRR or regular) for
|
|
// this channel in the same Opus frame was not coded, or
|
|
//
|
|
// * That previous SILK frame was coded, but was not voiced (see
|
|
// Section 4.2.7.3).
|
|
|
|
lagAbsolute := true
|
|
if lagAbsolute {
|
|
// With absolute coding, the primary pitch lag may range from 2 ms
|
|
// (inclusive) up to 18 ms (exclusive), corresponding to pitches from
|
|
// 500 Hz down to 55.6 Hz, respectively. It is comprised of a high part
|
|
// and a low part, where the decoder first reads the high part using the
|
|
// 32-entry codebook in Table 29 and then the low part using the
|
|
// codebook corresponding to the current audio bandwidth from Table 30.
|
|
//
|
|
// +------------+------------------------+-------+----------+----------+
|
|
// | Audio | PDF | Scale | Minimum | Maximum |
|
|
// | Bandwidth | | | Lag | Lag |
|
|
// +------------+------------------------+-------+----------+----------+
|
|
// | NB | {64, 64, 64, 64}/256 | 4 | 16 | 144 |
|
|
// | | | | | |
|
|
// | MB | {43, 42, 43, 43, 42, | 6 | 24 | 216 |
|
|
// | | 43}/256 | | | |
|
|
// | | | | | |
|
|
// | WB | {32, 32, 32, 32, 32, | 8 | 32 | 288 |
|
|
// | | 32, 32, 32}/256 | | | |
|
|
// +------------+------------------------+-------+----------+----------+
|
|
|
|
// Table 30: PDF for Low Part of Primary Pitch Lag
|
|
var (
|
|
lowPartICDF []uint
|
|
lagScale uint32
|
|
)
|
|
switch bandwidth {
|
|
case BandwidthNarrowband:
|
|
lowPartICDF = icdfPrimaryPitchLagLowPartNarrowband
|
|
lagScale = 4
|
|
lagMin = 16
|
|
lagMax = 144
|
|
case BandwidthMediumband:
|
|
lowPartICDF = icdfPrimaryPitchLagLowPartMediumband
|
|
lagScale = 6
|
|
lagMin = 24
|
|
lagMax = 216
|
|
case BandwidthWideband:
|
|
lowPartICDF = icdfPrimaryPitchLagLowPartWideband
|
|
lagScale = 8
|
|
lagMin = 32
|
|
lagMax = 288
|
|
}
|
|
|
|
lagHigh := d.rangeDecoder.DecodeSymbolWithICDF(icdfPrimaryPitchLagHighPart)
|
|
lagLow := d.rangeDecoder.DecodeSymbolWithICDF(lowPartICDF)
|
|
|
|
// The final primary pitch lag is then
|
|
//
|
|
// lag = lag_high*lag_scale + lag_low + lag_min
|
|
//
|
|
// where lag_high is the high part, lag_low is the low part, and
|
|
// lag_scale and lag_min are the values from the "Scale" and "Minimum
|
|
// Lag" columns of Table 30, respectively.
|
|
lag = lagHigh*lagScale + lagLow + lagMin
|
|
} else {
|
|
// TODO
|
|
}
|
|
|
|
// After the primary pitch lag, a "pitch contour", stored as a single
|
|
// entry from one of four small VQ codebooks, gives lag offsets for each
|
|
// subframe in the current SILK frame. The codebook index is decoded
|
|
// using one of the PDFs in Table 32 depending on the current frame size
|
|
// and audio bandwidth. Tables 33 through 36 give the corresponding
|
|
// offsets to apply to the primary pitch lag for each subframe given the
|
|
// decoded codebook index.
|
|
//
|
|
// +-----------+--------+----------+-----------------------------------+
|
|
// | Audio | SILK | Codebook | PDF |
|
|
// | Bandwidth | Frame | Size | |
|
|
// | | Size | | |
|
|
// +-----------+--------+----------+-----------------------------------+
|
|
// | NB | 10 ms | 3 | {143, 50, 63}/256 |
|
|
// | | | | |
|
|
// | NB | 20 ms | 11 | {68, 12, 21, 17, 19, 22, 30, 24, |
|
|
// | | | | 17, 16, 10}/256 |
|
|
// | | | | |
|
|
// | MB or WB | 10 ms | 12 | {91, 46, 39, 19, 14, 12, 8, 7, 6, |
|
|
// | | | | 5, 5, 4}/256 |
|
|
// | | | | |
|
|
// | MB or WB | 20 ms | 34 | {33, 22, 18, 16, 15, 14, 14, 13, |
|
|
// | | | | 13, 10, 9, 9, 8, 6, 6, 6, 5, 4, |
|
|
// | | | | 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, |
|
|
// | | | | 2, 1, 1, 1, 1}/256 |
|
|
// +-----------+--------+----------+-----------------------------------+
|
|
//
|
|
// Table 32: PDFs for Subframe Pitch Contour
|
|
|
|
// The final pitch lag for each subframe is assembled in
|
|
// silk_decode_pitch() (decode_pitch.c). Let lag be the primary pitch
|
|
// lag for the current SILK frame, contour_index be index of the VQ
|
|
// codebook, and lag_cb[contour_index][k] be the corresponding entry of
|
|
// the codebook from the appropriate table given above for the k'th
|
|
// subframe.
|
|
|
|
var (
|
|
lagCb [][]int8
|
|
lagIcdf []uint
|
|
)
|
|
|
|
switch bandwidth {
|
|
case BandwidthNarrowband:
|
|
lagCb = codebookSubframePitchCounterNarrowband20Ms
|
|
lagIcdf = icdfSubframePitchContourNarrowband20Ms
|
|
case BandwidthMediumband, BandwidthWideband:
|
|
lagCb = codebookSubframePitchCounterMediumbandOrWideband20Ms
|
|
lagIcdf = icdfSubframePitchContourMediumbandOrWideband20Ms
|
|
}
|
|
|
|
contourIndex := d.rangeDecoder.DecodeSymbolWithICDF(lagIcdf)
|
|
|
|
// Then the final pitch lag for that subframe is
|
|
//
|
|
// pitch_lags[k] = clamp(lag_min, lag + lag_cb[contour_index][k],
|
|
// lag_max)
|
|
pitchLags = make([]int, subframeCount)
|
|
for i := 0; i < subframeCount; i++ {
|
|
pitchLags[i] = int(clamp(
|
|
int32(lagMin),
|
|
int32(lag+uint32(lagCb[contourIndex][i])),
|
|
int32(lagMax)),
|
|
)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// This allows the encoder to trade off the prediction gain between
|
|
// packets against the recovery time after packet loss.
|
|
//
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.3
|
|
func (d *Decoder) decodeLTPScalingParamater(signalType frameSignalType) (LTPscaleQ14 float64) {
|
|
// An LTP scaling parameter appears after the LTP filter coefficients if
|
|
// and only if
|
|
//
|
|
// o This is a voiced frame (see Section 4.2.7.3), and
|
|
// o Either
|
|
// * This SILK frame corresponds to the first time interval of the
|
|
// current Opus frame for its type (LBRR or regular), or
|
|
//
|
|
// * This is an LBRR frame where the LBRR flags (see Section 4.2.4)
|
|
// indicate the previous LBRR frame in the same channel is not
|
|
// coded.
|
|
|
|
// Frames that do not code the scaling parameter
|
|
// use the default factor of 15565 (approximately 0.95).
|
|
if signalType != frameSignalTypeVoiced {
|
|
return 15565.0
|
|
}
|
|
|
|
// The three possible values represent Q14 scale factors of
|
|
// 15565, 12288, and 8192, respectively (corresponding to approximately
|
|
// 0.95, 0.75, and 0.5)
|
|
scaleFactorIndex := d.rangeDecoder.DecodeSymbolWithICDF(icdfLTPScalingParameter)
|
|
switch scaleFactorIndex {
|
|
case 0:
|
|
return 15565.0
|
|
case 1:
|
|
return 12288.0
|
|
case 2:
|
|
return 8192.0
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
// SILK uses a separate 5-tap pitch filter for each subframe, selected
|
|
// from one of three codebooks.
|
|
//
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.2
|
|
func (d *Decoder) decodeLTPFilterCoefficients(signalType frameSignalType) (bQ7 []int8) {
|
|
if signalType != frameSignalTypeVoiced {
|
|
return
|
|
}
|
|
|
|
bQ7 = make([]int8, 5)
|
|
|
|
// This is signaled with an explicitly-coded "periodicity index". This
|
|
// immediately follows the subframe pitch lags, and is coded using the
|
|
// 3-entry PDF from Table 37.
|
|
periodicityIndex := d.rangeDecoder.DecodeSymbolWithICDF(icdfPeriodicityIndex)
|
|
|
|
// The indices of the filters for each subframe follow. They are all
|
|
// coded using the PDF from Table 38 corresponding to the periodicity
|
|
// index. Tables 39 through 41 contain the corresponding filter taps as
|
|
// signed Q7 integers.
|
|
var filterIndiceIcdf []uint
|
|
switch periodicityIndex {
|
|
case 0:
|
|
filterIndiceIcdf = icdfLTPFilterIndex0
|
|
case 1:
|
|
filterIndiceIcdf = icdfLTPFilterIndex1
|
|
case 2:
|
|
filterIndiceIcdf = icdfLTPFilterIndex2
|
|
}
|
|
|
|
filterIndex := d.rangeDecoder.DecodeSymbolWithICDF(filterIndiceIcdf)
|
|
var LTPFilterCodebook [][]int8
|
|
|
|
switch periodicityIndex {
|
|
case 0:
|
|
LTPFilterCodebook = codebookLTPFilterPeriodicityIndex0
|
|
case 1:
|
|
LTPFilterCodebook = codebookLTPFilterPeriodicityIndex1
|
|
case 2:
|
|
LTPFilterCodebook = codebookLTPFilterPeriodicityIndex2
|
|
|
|
}
|
|
|
|
copy(bQ7, LTPFilterCodebook[filterIndex])
|
|
return
|
|
}
|
|
|
|
// let n be the number of samples in a subframe (40 for NB, 60 for
|
|
// MB, and 80 for WB)
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9
|
|
func (d *Decoder) samplesInSubframe(bandwidth Bandwidth) int {
|
|
switch bandwidth {
|
|
case BandwidthNarrowband:
|
|
return 40
|
|
case BandwidthMediumband:
|
|
return 60
|
|
case BandwidthWideband:
|
|
return 80
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1
|
|
func (d *Decoder) ltpSynthesis(
|
|
signalType frameSignalType,
|
|
pitchLags []int,
|
|
eQ23 []int32,
|
|
n, j, s int,
|
|
LTPscaleQ14 float64,
|
|
bandwidth Bandwidth,
|
|
wQ2 int16,
|
|
) (res []float64) {
|
|
// For unvoiced frames (see Section 4.2.7.3), the LPC residual for i
|
|
// such that j <= i < (j + n) is simply a normalized copy of the
|
|
// excitation signal, i.e.,
|
|
//
|
|
// e_Q23[i]
|
|
// res[i] = ---------
|
|
// 2.0**23
|
|
|
|
res = make([]float64, len(eQ23))
|
|
if signalType != frameSignalTypeVoiced {
|
|
for i := j; i < (j + n); i++ {
|
|
res[i] = float64(eQ23[i]) / 8388608
|
|
}
|
|
return
|
|
}
|
|
|
|
// Voiced SILK frames, on the other hand, pass the excitation through an
|
|
// LTP filter using the parameters decoded in Section 4.2.7.6 to produce
|
|
// an LPC residual.
|
|
return
|
|
}
|
|
|
|
// LPC synthesis uses the short-term LPC filter to predict the next
|
|
// output coefficient. For i such that (j - d_LPC) <= i < j, let lpc[i]
|
|
// be the result of LPC synthesis from the last d_LPC samples of the
|
|
// previous subframe or zeros in the first subframe for this channel
|
|
// after either
|
|
//
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.2
|
|
func (d *Decoder) lpcSynthesis(out []float64, bandwidth Bandwidth, n, s, dLPC int, aQ12, res, gainQ16 []float64) {
|
|
finalLPCValuesIndex := 0
|
|
|
|
// j be the index of the first sample in the residual corresponding to
|
|
// the current subframe.
|
|
j := 0
|
|
|
|
// let lpc[i] be the result of LPC synthesis from the last d_LPC samples of the
|
|
// previous subframe or zeros in the first subframe for this channel
|
|
lpc := make([]float64, n)
|
|
|
|
//Then, for i such that j <= i < (j + n), the result of LPC synthesis
|
|
//for the current subframe is
|
|
//
|
|
// d_LPC-1
|
|
// gain_Q16[i] __ a_Q12[k]
|
|
// lpc[i] = ----------- * res[i] + \ lpc[i-k-1] * --------
|
|
// 65536.0 /_ 4096.0
|
|
// k=0
|
|
//
|
|
var currentLPCVal float64
|
|
for i := j; i < (j + n); i++ {
|
|
lpcVal := gainQ16[s] / 65536.0
|
|
lpcVal *= res[i+(n*s)]
|
|
|
|
for k := 0; k < dLPC; k++ {
|
|
if i-k > 0 {
|
|
currentLPCVal = lpc[i-k-1]
|
|
} else {
|
|
currentLPCVal = d.finalLPCValues[len(d.finalLPCValues)-1+(i-k)]
|
|
}
|
|
|
|
lpcVal += currentLPCVal * (aQ12[k] / 4096.0)
|
|
}
|
|
|
|
lpc[i] = lpcVal
|
|
|
|
// The decoder saves the final d_LPC values, i.e., lpc[i] such that
|
|
// (j + n - d_LPC) <= i < (j + n), to feed into the LPC synthesis of the
|
|
// next subframe. This requires storage for up to 16 values of lpc[i]
|
|
// (for WB frames).
|
|
if (j+n-dLPC) <= i && i < (j+n) {
|
|
d.finalLPCValues[finalLPCValuesIndex] = lpcVal
|
|
finalLPCValuesIndex++
|
|
}
|
|
|
|
// Then, the signal is clamped into the final nominal range:
|
|
//
|
|
// out[i] = clamp(-1.0, lpc[i], 1.0)
|
|
//
|
|
out[i] = clampFloat(-1.0, lpc[i], 1.0)
|
|
}
|
|
}
|
|
|
|
// The remainder of the reconstruction process for the frame does not
|
|
// need to be bit-exact, as small errors should only introduce
|
|
// proportionally small distortions. Although the reference
|
|
// implementation only includes a fixed-point version of the remaining
|
|
// steps, this section describes them in terms of a floating-point
|
|
// version for simplicity. This produces a signal with a nominal range
|
|
// of -1.0 to 1.0.
|
|
//
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9
|
|
func (d *Decoder) silkFrameReconstruction(
|
|
signalType frameSignalType, bandwidth Bandwidth,
|
|
dLPC int,
|
|
pitchLags []int,
|
|
eQ23 []int32,
|
|
LTPscaleQ14 float64,
|
|
wQ2 int16,
|
|
aQ12, gainQ16, out []float64,
|
|
) {
|
|
// let n be the number of samples in a subframe
|
|
//
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9
|
|
n := d.samplesInSubframe(bandwidth)
|
|
|
|
// s be the index of the current subframe in this SILK frame
|
|
// (0 or 1 for 10 ms frames, or 0 to 3 for 20 ms frames)
|
|
for s := 0; s < subframeCount; s++ {
|
|
// j be the index of the first sample in the residual corresponding to
|
|
// the current subframe.
|
|
//
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9
|
|
j := n * s
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1
|
|
res := d.ltpSynthesis(signalType, pitchLags, eQ23, n, j, s, LTPscaleQ14, bandwidth, wQ2)
|
|
|
|
//https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.2
|
|
d.lpcSynthesis(out[n*s:], bandwidth, n, s, dLPC, aQ12, res, gainQ16)
|
|
}
|
|
}
|
|
|
|
// Decode decodes many SILK subframes
|
|
//
|
|
// An overview of the decoder is given in Figure 14.
|
|
//
|
|
// +---------+ +------------+
|
|
// -->| Range |--->| Decode |---------------------------+
|
|
// 1 | Decoder | 2 | Parameters |----------+ 5 |
|
|
// +---------+ +------------+ 4 | |
|
|
// 3 | | |
|
|
// \/ \/ \/
|
|
// +------------+ +------------+ +------------+
|
|
// | Generate |-->| LTP |-->| LPC |
|
|
// | Excitation | | Synthesis | | Synthesis |
|
|
// +------------+ +------------+ +------------+
|
|
// ^ |
|
|
// | |
|
|
// +-------------------+----------------+
|
|
// | 6
|
|
// | +------------+ +-------------+
|
|
// +-->| Stereo |-->| Sample Rate |-->
|
|
// | Unmixing | 7 | Conversion | 8
|
|
// +------------+ +-------------+
|
|
//
|
|
// 1: Range encoded bitstream
|
|
// 2: Coded parameters
|
|
// 3: Pulses, LSBs, and signs
|
|
// 4: Pitch lags, Long-Term Prediction (LTP) coefficients
|
|
// 5: Linear Predictive Coding (LPC) coefficients and gains
|
|
// 6: Decoded signal (mono or mid-side stereo)
|
|
// 7: Unmixed signal (mono or left-right stereo)
|
|
// 8: Resampled signal
|
|
//
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.1
|
|
func (d *Decoder) Decode(in []byte, out []float64, isStereo bool, nanoseconds int, bandwidth Bandwidth) error {
|
|
subframeSize := d.samplesInSubframe(bandwidth)
|
|
switch {
|
|
case nanoseconds != nanoseconds20Ms:
|
|
return errUnsupportedSilkFrameDuration
|
|
case isStereo:
|
|
return errUnsupportedSilkStereo
|
|
case (subframeSize * subframeCount) > len(out):
|
|
return errOutBufferTooSmall
|
|
}
|
|
|
|
d.rangeDecoder.Init(in)
|
|
|
|
voiceActivityDetected, lowBitRateRedundancy := d.decodeHeaderBits()
|
|
if lowBitRateRedundancy {
|
|
return errUnsupportedSilkLowBitrateRedundancy
|
|
}
|
|
|
|
signalType, quantizationOffsetType := d.determineFrameType(voiceActivityDetected)
|
|
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.4
|
|
gainQ16 := d.decodeSubframeQuantizations(signalType)
|
|
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.1
|
|
I1 := d.normalizeLineSpectralFrequencyStageOne(signalType == frameSignalTypeVoiced, bandwidth)
|
|
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.2
|
|
dLPC, resQ10 := d.normalizeLineSpectralFrequencyStageTwo(bandwidth, I1)
|
|
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.3
|
|
nlsfQ15 := d.normalizeLineSpectralFrequencyCoefficients(dLPC, bandwidth, resQ10, I1)
|
|
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.4
|
|
d.normalizeLSFStabilization(nlsfQ15)
|
|
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.5
|
|
n1Q15, wQ2 := d.normalizeLSFInterpolation(nlsfQ15)
|
|
|
|
// https://datatracker.ietf.org/doc/html/rfc6716#section-4.2.7.5.6
|
|
a32Q17 := d.convertNormalizedLSFsToLPCCoefficients(n1Q15, bandwidth)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.5.7
|
|
d.limitLPCCoefficientsRange(a32Q17)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.5.8
|
|
aQ12 := d.limitLPCFilterPredictionGain(a32Q17)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.1
|
|
_, pitchLags := d.decodePitchLags(signalType, bandwidth)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.2
|
|
d.decodeLTPFilterCoefficients(signalType)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.3
|
|
LTPscaleQ14 := d.decodeLTPScalingParamater(signalType)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.7
|
|
lcgSeed := d.decodeLinearCongruentialGeneratorSeed()
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.8
|
|
shellblocks := d.decodeShellblocks(nanoseconds, bandwidth)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.8.1
|
|
rateLevel := d.decodeRatelevel(signalType == frameSignalTypeVoiced)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.8.2
|
|
pulsecounts, lsbcounts := d.decodePulseAndLSBCounts(shellblocks, rateLevel)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.8.6
|
|
eQ23 := d.decodeExcitation(signalType, quantizationOffsetType, lcgSeed, pulsecounts, lsbcounts)
|
|
|
|
// https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9
|
|
d.silkFrameReconstruction(signalType, bandwidth,
|
|
dLPC,
|
|
pitchLags,
|
|
eQ23,
|
|
LTPscaleQ14,
|
|
wQ2,
|
|
aQ12, gainQ16, out,
|
|
)
|
|
|
|
// n0Q15 is the LSF coefficients decoded for the prior frame
|
|
// see normalizeLSFInterpolation.
|
|
if len(d.n0Q15) != len(nlsfQ15) {
|
|
d.n0Q15 = make([]int16, len(nlsfQ15))
|
|
}
|
|
|
|
copy(d.n0Q15, nlsfQ15)
|
|
d.isPreviousFrameVoiced = signalType == frameSignalTypeVoiced
|
|
d.haveDecoded = true
|
|
|
|
return nil
|
|
}
|