Remove writes to memory in sse4.1 parsing code, fix edge case (#588)

This commit is contained in:
redzic 2022-03-11 13:21:12 -06:00 committed by GitHub
parent 2efb7daa73
commit 9242b036d0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 33 additions and 67 deletions

View file

@ -493,12 +493,7 @@ impl Encoder {
}
/// Parses the number of encoded frames
///
/// # Safety
///
/// The caller should not attempt to read the contents of `line` after
/// this function has been called.
pub(crate) unsafe fn parse_encoded_frames(self, line: &mut str) -> Option<u64> {
pub(crate) fn parse_encoded_frames(self, line: &str) -> Option<u64> {
use crate::parse::*;
match self {
@ -506,7 +501,7 @@ impl Encoder {
cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
if is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("ssse3") {
return parse_aom_vpx_frames_sse41(line.as_bytes_mut());
return unsafe { parse_aom_vpx_frames_sse41(line.as_bytes()) };
}
}
}

View file

@ -39,13 +39,19 @@ pub fn parse_aom_vpx_frames(s: &str) -> Option<u64> {
// The numbers for aomenc/vpxenc are buffered/encoded frames, so we want the
// second number (actual encoded frames)
let first_digit_index = s
.as_bytes()
.get(AOM_VPX_IGNORED_PREFIX.len() - 1..)?
.iter()
.position(|&c| c == b'/')?;
let first_space_index = s
.get(AOM_VPX_IGNORED_PREFIX.len()..)?
.get(AOM_VPX_IGNORED_PREFIX.len() + first_digit_index..)?
.as_bytes()
.iter()
.position(|&c| c == b' ')?;
.position(|&c| c == b' ')?
+ first_digit_index;
let first_digit_index = (first_space_index / 2).saturating_sub(2);
s.get(
AOM_VPX_IGNORED_PREFIX.len() + first_digit_index
..AOM_VPX_IGNORED_PREFIX.len() + first_space_index,
@ -63,12 +69,11 @@ pub fn parse_aom_vpx_frames(s: &str) -> Option<u64> {
///
/// # Safety
///
/// The caller should not attempt to read the contents of `s` after this
/// function has been called.
/// The CPU must support SSSE3 and SSE4.1.
#[inline]
#[target_feature(enable = "ssse3,sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub unsafe fn parse_aom_vpx_frames_sse41(s: &mut [u8]) -> Option<u64> {
pub unsafe fn parse_aom_vpx_frames_sse41(s: &[u8]) -> Option<u64> {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
@ -154,52 +159,17 @@ pub unsafe fn parse_aom_vpx_frames_sse41(s: &mut [u8]) -> Option<u64> {
// actual first space character.
let first_space_index = mask16.trailing_zeros() as usize;
// It is impossible that `first_space_index == 0` for valid aomenc output, since the
// first character after the ignored prefix has to be a digit.
//
// All indexes are relative to `relevant_output`.
// ↓ Since the first digit occurs here, its index = 0
// Pass 1/1 frame 3/2 2131B 5997 us 500.25 fps [ETA unknown]
// * ^ n = 1, first_digit_index = 0 (the first character is the first digit)
// ↑
// ╰ end of ignored prefix (continued below)
//
// Pass 1/1 frame 84/83 81091B 132314 us 634.85 fps [ETA unknown]
// * ^ n = 2, first_digit_index = 0
//
// Pass 1/1 frame 142/141 156465B 208875 us 679.83 fps [ETA unknown]
// * ^ n = 3, first_digit_index = 0
//
// Pass 1/1 frame 4232/4231 5622510B 5518075 us 766.93 fps [ETA unknown]
// * ^ n = 4, first_digit_index = 0
//
// Pass 1/1 frame 13380/13379 17860525B 16760 ms 798.31 fps [ETA unknown]
// * ^ n = 6, first_digit_index = 1
//
// Pass 1/1 frame 102262/102261 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F
// * ^ n = 8, first_digit_index = 2
// ^^^^^^^^
// 12345678
// n = 8 signifies that there are 8 characters before the first space.
// This also happens to be first_space_index.
//
// Pass 1/1 frame 1022621/1022611 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F
// * ^ n = 10, first_digit_index = 3
//
// Solving a linear equation for n and first_digit_index yields this
// formula, but first_digit_index cannot be negative so we use saturating_sub.
let first_digit_index = (first_space_index / 2).saturating_sub(2);
let first_digit_index = {
_mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(s.get(AOM_VPX_IGNORED_PREFIX.len() - 1..)?.as_ptr().cast()),
_mm_set1_epi8(b'/' as i8),
))
.trailing_zeros() as usize
};
// Set `CHUNK_SIZE` bytes before the real first digit index (including the ignored prefix)
// to b'0'. Unconditionally zeroing `CHUNK_SIZE` bytes is better than only setting the
// bytes that are absolutely necessary because using a fixed size allows LLVM to avoid
// a call to memset and instead use movaps/movups.
for byte in s
.get_unchecked_mut(AOM_VPX_IGNORED_PREFIX.len() + first_digit_index - CHUNK_SIZE..)
.get_unchecked_mut(..CHUNK_SIZE)
{
*byte = b'0';
}
let ascending = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let num_digits = first_space_index - first_digit_index;
let dynamic_mask = _mm_cmplt_epi8(ascending, _mm_set1_epi8(num_digits as i8));
// At this point, we have done all the setup and can use the actual SIMD integer
// parsing algorithm. The description of the algorithm can be found here:
@ -213,6 +183,9 @@ pub unsafe fn parse_aom_vpx_frames_sse41(s: &mut [u8]) -> Option<u64> {
let zeros = _mm_set1_epi8(b'0' as i8);
chunk = _mm_sub_epi8(chunk, zeros);
// Mask out the irrelevant bits, effectively parsing them as if they were 0.
chunk = _mm_and_si128(chunk, dynamic_mask);
let mult = _mm_set_epi8(1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10);
chunk = _mm_maddubs_epi16(chunk, mult);
let mult = _mm_set_epi16(1, 100, 1, 100, 1, 100, 1, 100);
@ -444,7 +417,7 @@ mod tests {
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn aom_vpx_sse41_parsing() {
fn aom_vpx_parsing() {
let test_cases = [
(
"Pass 1/1 frame 3/2 2131B 5997 us 500.25 fps [ETA unknown]",
@ -483,13 +456,15 @@ mod tests {
"Pass 1/1 frame 1022621/1022611 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F",
Some(1_022_611),
),
(
"Pass 1/1 frame 10000/9999 5319227B 8663074 us 1154.32 fps [ETA unknown]",
Some(9999),
),
];
if is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("ssse3") {
for (s, ans) in test_cases {
let mut s = String::from(s);
assert_eq!(unsafe { parse_aom_vpx_frames_sse41(s.as_bytes_mut()) }, ans);
assert_eq!(unsafe { parse_aom_vpx_frames_sse41(s.as_bytes()) }, ans);
}
}

View file

@ -397,11 +397,7 @@ impl EncodeArgs {
enc_stderr.push_str(line);
enc_stderr.push('\n');
// SAFETY: we do not read the contents of `line` after this function call
if let Some(new) = unsafe { self.encoder.parse_encoded_frames(line) } {
// clippy is actually wrong that this does nothing, dropping a mutable
// reference does prevent it from being used again.
drop(line);
if let Some(new) = self.encoder.parse_encoded_frames(line) {
if new > frame {
if self.verbosity == Verbosity::Normal {
inc_bar((new - frame) as u64);