Implement SSE4.1/SSSE3 SIMD-accelerated frame parsing for aomenc/vpxenc (#419)

The SSE4.1/SSSE3 implementation of the parsing routines for
aomenc/vpxenc outperforms the old regex by a factor of about 5000
(about 0.7 ns vs. 3.5 μs to parse a single line on a Zen 1 1950X).

The idea is to eventually remove regex as a dependency entirely,
especially since there is an open PR to indicatif that speeds up the
template expansion (which replaces regexes with hand-written parsing).
This commit is contained in:
redzic 2021-11-20 10:35:31 -06:00 committed by GitHub
parent 65772ba4c7
commit 5a23cc3565
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 297 additions and 26 deletions

8
Cargo.lock generated
View file

@ -58,9 +58,9 @@ dependencies = [
[[package]]
name = "anyhow"
version = "1.0.45"
version = "1.0.47"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee10e43ae4a853c0a3591d4e2ada1719e553be18199d9da9d4a83f5927c2f5c7"
checksum = "38d9ff5d688f1c13395289f67db01d4826b46dd694e7580accdc3e8430f2d98e"
[[package]]
name = "arbitrary"
@ -70,9 +70,9 @@ checksum = "db55d72333851e17d572bec876e390cd3b11eb1ef53ae821dd9f3b653d2b4569"
[[package]]
name = "arg_enum_proc_macro"
version = "0.3.1"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c8e0146624e9b300cfeb394a758bd23e2fe494579c6363413354f04e8b49261"
checksum = "d7c29b43ee8654590587cd033b3eca2f9c4f8cdff945ec0e6ee91ceb057d87f3"
dependencies = [
"proc-macro2",
"quote",

View file

@ -263,11 +263,11 @@ pub struct CliOpts {
#[structopt(long)]
pub probe_slow: bool,
/// Min q for target quality
/// Min Q for target quality
#[structopt(long)]
pub min_q: Option<u32>,
/// Max q for target quality
/// Max Q for target quality
#[structopt(long)]
pub max_q: Option<u32>,

View file

@ -11,6 +11,7 @@ use std::{
sync::mpsc::Sender,
};
use cfg_if::cfg_if;
use smallvec::SmallVec;
use thiserror::Error;
@ -103,7 +104,7 @@ impl<'a> Broker<'a> {
}
drop(sender);
cfg_if::cfg_if! {
cfg_if! {
if #[cfg(any(target_os = "linux", target_os = "windows"))] {
if let Some(threads) = set_thread_affinity {
let available_threads = num_cpus::get();
@ -129,7 +130,7 @@ impl<'a> Broker<'a> {
.map(|(rx, queue, worker_id)| {
let tx = tx.clone();
s.spawn(move |_| {
cfg_if::cfg_if! {
cfg_if! {
if #[cfg(any(target_os = "linux", target_os = "windows"))] {
if let Some(threads) = set_thread_affinity {
let mut cpu_set = SmallVec::<[usize; 16]>::new();

View file

@ -1,7 +1,7 @@
use crate::{ffmpeg::compose_ffmpeg_pipe, into_vec, list_index, regex};
use cfg_if::cfg_if;
use ffmpeg_next::format::Pixel;
use itertools::chain;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, cmp, fmt::Display, path::PathBuf};
use thiserror::Error;
@ -327,7 +327,7 @@ impl Encoder {
}
}
/// Returns function pointer used for matching q/crf arguments in command line
/// Returns function pointer used for matching Q/CRF arguments in command line
fn q_match_fn(self) -> fn(&str) -> bool {
match self {
Self::aom | Self::vpx => |p| p.starts_with("--cq-level="),
@ -356,22 +356,35 @@ impl Encoder {
new_params
}
/// Retuns regex for matching encoder progress in cli
fn pipe_match(self) -> &'static Regex {
match self {
Self::aom | Self::vpx => regex!(r".*Pass (?:1/1|2/2) .*frame.*?/([^ ]+?) "),
/// Parses the number of encoded frames
///
/// # Safety
///
/// The caller should not attempt to read the contents of `line` after
/// this function has been called.
pub(crate) unsafe fn parse_encoded_frames(self, line: &mut str) -> Option<u64> {
let regex = match self {
Self::aom | Self::vpx => {
cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
if is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("ssse3") {
return crate::simd::parse_aom_vpx_frames_sse41(line.as_bytes_mut());
}
}
}
// The numbers for aomenc/vpxenc are buffered/encoded frames, so we want the
// second number (actual encoded frames)
regex!(r".*Pass (?:1/1|2/2) .*frame.*?/([^ ]+?) ")
}
Self::rav1e => regex!(r"encoded.*? ([^ ]+?) "),
Self::svt_av1 => regex!(r"Encoding frame\s+(\d+)"),
Self::x264 => regex!(r"^[^\d]*(\d+)"),
Self::x265 => regex!(r"(\d+) frames"),
}
}
};
/// Returs option of q/crf value from cli encoder output
pub fn match_line(self, line: &str) -> Option<usize> {
let encoder_regex = self.pipe_match();
let captures = encoder_regex.captures(line)?.get(1)?.as_str();
captures.parse::<usize>().ok()
let captures = regex.captures(line)?.get(1)?.as_str();
captures.parse().ok()
}
/// Returns command used for target quality probing

View file

@ -47,6 +47,7 @@ pub mod ffmpeg;
pub mod progress_bar;
pub mod scene_detect;
pub mod settings;
pub(crate) mod simd;
pub mod split;
pub mod target_quality;
pub mod util;

View file

@ -36,6 +36,7 @@ fn pretty_progress_style() -> ProgressStyle {
/// Enables steady 100 ms tick
pub fn init_progress_bar(len: u64) {
let pb = PROGRESS_BAR.get_or_init(|| ProgressBar::new(len).with_style(pretty_progress_style()));
pb.set_draw_target(ProgressDrawTarget::stderr());
pb.enable_steady_tick(100);
pb.reset();
pb.reset_eta();

View file

@ -312,11 +312,18 @@ impl EncodeArgs {
break;
}
if let Ok(line) = simdutf8::basic::from_utf8(&buf) {
if let Ok(line) = simdutf8::basic::from_utf8_mut(&mut buf) {
if self.verbosity == Verbosity::Verbose && !line.contains('\n') {
update_mp_msg(worker_id, line.to_string());
}
if let Some(new) = self.encoder.match_line(line) {
// This needs to be done before parse_encoded_frames, as it potentially
// mutates the string
enc_stderr.push_str(line);
enc_stderr.push('\n');
// SAFETY: we do not read the contents of `line` after this function call
if let Some(new) = unsafe { self.encoder.parse_encoded_frames(line) } {
drop(line);
if new > frame {
if self.verbosity == Verbosity::Normal {
update_bar((new - frame) as u64);
@ -326,8 +333,6 @@ impl EncodeArgs {
frame = new;
}
}
enc_stderr.push_str(line);
enc_stderr.push('\n');
}
buf.clear();
@ -898,7 +903,7 @@ properly into a mkv file. Specify mkvmerge as the concatenation method by settin
self.workers = cmp::min(self.workers, chunk_queue.len());
eprintln!(
"{}{} {} {}{} {} {}{} {}\n{}: {}\n",
"{}{} {} {}{} {} {}{} {}\n{}: {}",
Color::Green.bold().paint("Q"),
Color::Green.paint("ueue"),
Color::Green.bold().paint(chunk_queue.len().to_string()),

250
av1an-core/src/simd.rs Normal file
View file

@ -0,0 +1,250 @@
//! SIMD-optimized functions for parsing
/// x86 SIMD implementation of parsing aomenc/vpxenc progress output using
/// SSSE3+SSE4.1, returning the number of frames processed, or `None`
/// if the input did not match.
///
/// The progress line contains two frame counters separated by `/`
/// (for example `frame 142/141`); this function returns the *second* one
/// (`Some(141)` in that example).
///
/// This function also works for parsing vpxenc output, as its progress
/// printing is exactly the same.
///
/// `None` is returned when:
///
/// * the line is shorter than `IGNORED_PREFIX.len() + 16` bytes,
/// * the line does not start with `Pass 1/1` or `Pass 2/2`,
/// * there is no space in the 16 bytes following the ignored prefix, or
/// * there is no character between the start of the number and that space.
///
/// The bytes that make up the number are *not* checked to be ASCII digits.
///
/// # Safety
///
/// * The CPU executing this function must support SSSE3 and SSE4.1
///   (check with `is_x86_feature_detected!` before calling).
/// * This function overwrites 16 bytes of `s` with `b'0'`, so the caller
///   should not attempt to read the contents of `s` after this function
///   has been called.
#[inline]
#[target_feature(enable = "ssse3,sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub(crate) unsafe fn parse_aom_vpx_frames_sse41(s: &mut [u8]) -> Option<u64> {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;
    use std::mem::transmute;

    // Number of bytes processed at once (size in bytes of an xmm register).
    //
    // There is no benefit to using wider SIMD lanes in this case: we want to
    // touch as few bytes as possible to get the correct result, so we use the
    // most commonly available SIMD width.
    const CHUNK_SIZE: usize = 16;

    // Everything up to this length can always be skipped: the second number
    // ends somewhere after it. Depending on how many digits the first number
    // has, the prefix ends either right after the '/' or inside the first
    // number; both situations are handled below.
    //
    // NOTE: the code below subtracts `CHUNK_SIZE` from offsets that are at
    // least `IGNORED_PREFIX.len()`, so the prefix must never be shorter than
    // `CHUNK_SIZE` bytes.
    #[rustfmt::skip]
    const IGNORED_PREFIX: &str =
        "Pass x/x frame x/";

    // All later accesses stay within the first
    // `IGNORED_PREFIX.len() + CHUNK_SIZE` bytes, so this single length check
    // covers every unchecked load/store in this function. It also serves as
    // input validation.
    if s.len() < IGNORED_PREFIX.len() + CHUNK_SIZE {
        return None;
    }

    // Only the final pass reports the frames we are interested in. Parsing
    // any other line would return a garbage value and throw the frame
    // counter completely off.
    if !(s.starts_with(b"Pass 2/2") || s.starts_with(b"Pass 1/1")) {
        return None;
    }

    // The 16 bytes right after the ignored prefix. In bounds because of the
    // length check above.
    let window = _mm_loadu_si128(s.as_ptr().add(IGNORED_PREFIX.len()) as *const __m128i);

    // Bit `i` of `space_mask` is set iff `window[i]` is a space. Only the
    // low 16 bits can ever be set.
    let space_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(window, _mm_set1_epi8(b' ' as i8)));

    // Without a space in the window we cannot tell where the number ends.
    // `trailing_zeros` would report 32 here, which would push the load below
    // past the bytes validated by the length check.
    if space_mask == 0 {
        return None;
    }

    // Index (relative to `window`) of the space terminating the number;
    // always in `0..=15` because `space_mask` is non-zero.
    let first_space_index = space_mask.trailing_zeros() as usize;

    // The number starts right after the last '/' that precedes the space, or
    // at the very start of the window if the '/' is part of the ignored
    // prefix. Looking for the '/' itself (instead of deriving its position
    // from `first_space_index`) keeps this correct when the two counters have
    // a different number of digits, e.g. `10000/9999`.
    let slash_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(window, _mm_set1_epi8(b'/' as i8)));
    let slashes_before_space = slash_mask & ((1 << first_space_index) - 1);
    let first_digit_index = (32 - slashes_before_space.leading_zeros()) as usize;

    // No characters between the start of the number and the space: this is
    // not a progress line we understand.
    if first_digit_index == first_space_index {
        return None;
    }

    // Overwrite the `CHUNK_SIZE` bytes in front of the first digit with
    // b'0'. This reuses the line's own storage as zero padding, so the number
    // becomes a fixed-width, 16-digit, zero-padded decimal ending right before
    // the space. A single unconditional 16-byte store is cheaper than zeroing
    // only the bytes that are strictly necessary.
    let ascii_zeros = _mm_set1_epi8(b'0' as i8);
    _mm_storeu_si128(
        s.as_mut_ptr()
            .add(IGNORED_PREFIX.len() + first_digit_index - CHUNK_SIZE) as *mut __m128i,
        ascii_zeros,
    );

    // At this point all the setup is done and the actual SIMD integer parsing
    // algorithm can be used. A description of the algorithm can be found here:
    // https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html
    //
    // Load the 16 bytes that end right before the space.
    let mut chunk = _mm_loadu_si128(
        s.as_ptr()
            .add(IGNORED_PREFIX.len() + first_space_index - CHUNK_SIZE) as *const __m128i,
    );

    // ASCII digits -> numeric values 0..=9.
    chunk = _mm_sub_epi8(chunk, ascii_zeros);

    // Combine adjacent digits into 2-digit numbers (16-bit lanes).
    let mult = _mm_set_epi8(1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10);
    chunk = _mm_maddubs_epi16(chunk, mult);

    // Combine adjacent 2-digit numbers into 4-digit numbers (32-bit lanes).
    let mult = _mm_set_epi16(1, 100, 1, 100, 1, 100, 1, 100);
    chunk = _mm_madd_epi16(chunk, mult);

    // Narrow back to 16-bit lanes (each value is at most 9999) and combine
    // adjacent 4-digit numbers into 8-digit numbers.
    chunk = _mm_packus_epi32(chunk, chunk);
    let mult = _mm_set_epi16(0, 0, 0, 0, 1, 10000, 1, 10000);
    chunk = _mm_madd_epi16(chunk, mult);

    // The low 64 bits now hold the upper 8 digits in the low 32 bits and the
    // lower 8 digits in the high 32 bits.
    let lanes = transmute::<_, [u64; 2]>(chunk);

    Some(((lanes[0] & 0xffffffff) * 100000000) + (lanes[0] >> 32))
}
// The function under test only exists on x86/x86_64, so the whole test module
// is gated on the architecture; otherwise the `use` below would fail to
// resolve on other targets.
#[cfg(test)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod tests {
    use crate::simd::parse_aom_vpx_frames_sse41;

    /// Checks the SIMD parser against sample aomenc/vpxenc progress lines,
    /// including lines that must be rejected.
    #[test]
    fn aom_vpx_sse41_parsing() {
        let test_cases = [
            (
                "Pass 1/1 frame 3/2 2131B 5997 us 500.25 fps [ETA unknown]",
                Some(2),
            ),
            (
                "Pass 2/2 frame 84/83 81091B 132314 us 634.85 fps [ETA unknown]",
                Some(83),
            ),
            (
                "Pass 1/1 frame 142/141 156465B 208875 us 679.83 fps [ETA unknown]",
                Some(141),
            ),
            (
                "Pass 1/2 frame 142/141 156465B 208875 us 679.83 fps [ETA unknown]",
                None,
            ),
            (
                "Pass 2/2 frame 4232/4231 5622510B 5518075 us 766.93 fps [ETA unknown]",
                Some(4231),
            ),
            (
                "Pass 1/1 frame 13380/13379 17860525B 16760 ms 798.31 fps [ETA unknown]",
                Some(13379),
            ),
            (
                "Pass 1/2 frame 13380/13379 17860525B 16760 ms 798.31 fps [ETA unknown]",
                None,
            ),
            ("invalid data", None),
            (
                "Pass 2/2 frame 102262/102261 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F",
                Some(102261),
            ),
            (
                "Pass 1/1 frame 1022621/1022611 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F",
                Some(1022611),
            ),
        ];

        // The parser is compiled with both features enabled, so both must be
        // present on the machine running the tests before it may be called.
        if is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("ssse3") {
            for (s, ans) in test_cases {
                let mut s = String::from(s);
                // SAFETY: SSSE3 and SSE4.1 were detected above, and `s` is not
                // read again after the call.
                assert_eq!(unsafe { parse_aom_vpx_frames_sse41(s.as_bytes_mut()) }, ans);
            }
        }
    }
}