Implement SSE4.1/SSSE3 SIMD-accelerated frame parsing for aomenc/vpxenc (#419)
The SSE4.1/SSSE3 implementation of the parsing routines for aomenc/vpxenc outperforms the old regex by a factor of about 5000x (3.5 μs vs. 0.7 ns to parse a single line on a Zen 1 1950X). The idea is to eventually remove regex as a dependency entirely, especially since there is an open PR to indicatif that speeds up the template expansion (which replaces regexes with hand-written parsing).
This commit is contained in:
parent
65772ba4c7
commit
5a23cc3565
8
Cargo.lock
generated
8
Cargo.lock
generated
|
@ -58,9 +58,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.45"
|
||||
version = "1.0.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee10e43ae4a853c0a3591d4e2ada1719e553be18199d9da9d4a83f5927c2f5c7"
|
||||
checksum = "38d9ff5d688f1c13395289f67db01d4826b46dd694e7580accdc3e8430f2d98e"
|
||||
|
||||
[[package]]
|
||||
name = "arbitrary"
|
||||
|
@ -70,9 +70,9 @@ checksum = "db55d72333851e17d572bec876e390cd3b11eb1ef53ae821dd9f3b653d2b4569"
|
|||
|
||||
[[package]]
|
||||
name = "arg_enum_proc_macro"
|
||||
version = "0.3.1"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c8e0146624e9b300cfeb394a758bd23e2fe494579c6363413354f04e8b49261"
|
||||
checksum = "d7c29b43ee8654590587cd033b3eca2f9c4f8cdff945ec0e6ee91ceb057d87f3"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
|
|
@ -263,11 +263,11 @@ pub struct CliOpts {
|
|||
#[structopt(long)]
|
||||
pub probe_slow: bool,
|
||||
|
||||
/// Min q for target quality
|
||||
/// Min Q for target quality
|
||||
#[structopt(long)]
|
||||
pub min_q: Option<u32>,
|
||||
|
||||
/// Max q for target quality
|
||||
/// Max Q for target quality
|
||||
#[structopt(long)]
|
||||
pub max_q: Option<u32>,
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ use std::{
|
|||
sync::mpsc::Sender,
|
||||
};
|
||||
|
||||
use cfg_if::cfg_if;
|
||||
use smallvec::SmallVec;
|
||||
use thiserror::Error;
|
||||
|
||||
|
@ -103,7 +104,7 @@ impl<'a> Broker<'a> {
|
|||
}
|
||||
drop(sender);
|
||||
|
||||
cfg_if::cfg_if! {
|
||||
cfg_if! {
|
||||
if #[cfg(any(target_os = "linux", target_os = "windows"))] {
|
||||
if let Some(threads) = set_thread_affinity {
|
||||
let available_threads = num_cpus::get();
|
||||
|
@ -129,7 +130,7 @@ impl<'a> Broker<'a> {
|
|||
.map(|(rx, queue, worker_id)| {
|
||||
let tx = tx.clone();
|
||||
s.spawn(move |_| {
|
||||
cfg_if::cfg_if! {
|
||||
cfg_if! {
|
||||
if #[cfg(any(target_os = "linux", target_os = "windows"))] {
|
||||
if let Some(threads) = set_thread_affinity {
|
||||
let mut cpu_set = SmallVec::<[usize; 16]>::new();
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
use crate::{ffmpeg::compose_ffmpeg_pipe, into_vec, list_index, regex};
|
||||
use cfg_if::cfg_if;
|
||||
use ffmpeg_next::format::Pixel;
|
||||
use itertools::chain;
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{borrow::Cow, cmp, fmt::Display, path::PathBuf};
|
||||
use thiserror::Error;
|
||||
|
@ -327,7 +327,7 @@ impl Encoder {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns function pointer used for matching q/crf arguments in command line
|
||||
/// Returns function pointer used for matching Q/CRF arguments in command line
|
||||
fn q_match_fn(self) -> fn(&str) -> bool {
|
||||
match self {
|
||||
Self::aom | Self::vpx => |p| p.starts_with("--cq-level="),
|
||||
|
@ -356,22 +356,35 @@ impl Encoder {
|
|||
new_params
|
||||
}
|
||||
|
||||
/// Retuns regex for matching encoder progress in cli
|
||||
fn pipe_match(self) -> &'static Regex {
|
||||
match self {
|
||||
Self::aom | Self::vpx => regex!(r".*Pass (?:1/1|2/2) .*frame.*?/([^ ]+?) "),
|
||||
/// Parses the number of encoded frames
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller should not attempt to read the contents of `line` after
|
||||
/// this function has been called.
|
||||
pub(crate) unsafe fn parse_encoded_frames(self, line: &mut str) -> Option<u64> {
|
||||
let regex = match self {
|
||||
Self::aom | Self::vpx => {
|
||||
cfg_if! {
|
||||
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
|
||||
if is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("ssse3") {
|
||||
return crate::simd::parse_aom_vpx_frames_sse41(line.as_bytes_mut());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The numbers for aomenc/vpxenc are buffered/encoded frames, so we want the
|
||||
// second number (actual encoded frames)
|
||||
regex!(r".*Pass (?:1/1|2/2) .*frame.*?/([^ ]+?) ")
|
||||
}
|
||||
Self::rav1e => regex!(r"encoded.*? ([^ ]+?) "),
|
||||
Self::svt_av1 => regex!(r"Encoding frame\s+(\d+)"),
|
||||
Self::x264 => regex!(r"^[^\d]*(\d+)"),
|
||||
Self::x265 => regex!(r"(\d+) frames"),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// Returs option of q/crf value from cli encoder output
|
||||
pub fn match_line(self, line: &str) -> Option<usize> {
|
||||
let encoder_regex = self.pipe_match();
|
||||
let captures = encoder_regex.captures(line)?.get(1)?.as_str();
|
||||
captures.parse::<usize>().ok()
|
||||
let captures = regex.captures(line)?.get(1)?.as_str();
|
||||
captures.parse().ok()
|
||||
}
|
||||
|
||||
/// Returns command used for target quality probing
|
||||
|
|
|
@ -47,6 +47,7 @@ pub mod ffmpeg;
|
|||
pub mod progress_bar;
|
||||
pub mod scene_detect;
|
||||
pub mod settings;
|
||||
pub(crate) mod simd;
|
||||
pub mod split;
|
||||
pub mod target_quality;
|
||||
pub mod util;
|
||||
|
|
|
@ -36,6 +36,7 @@ fn pretty_progress_style() -> ProgressStyle {
|
|||
/// Enables steady 100 ms tick
|
||||
pub fn init_progress_bar(len: u64) {
|
||||
let pb = PROGRESS_BAR.get_or_init(|| ProgressBar::new(len).with_style(pretty_progress_style()));
|
||||
pb.set_draw_target(ProgressDrawTarget::stderr());
|
||||
pb.enable_steady_tick(100);
|
||||
pb.reset();
|
||||
pb.reset_eta();
|
||||
|
|
|
@ -312,11 +312,18 @@ impl EncodeArgs {
|
|||
break;
|
||||
}
|
||||
|
||||
if let Ok(line) = simdutf8::basic::from_utf8(&buf) {
|
||||
if let Ok(line) = simdutf8::basic::from_utf8_mut(&mut buf) {
|
||||
if self.verbosity == Verbosity::Verbose && !line.contains('\n') {
|
||||
update_mp_msg(worker_id, line.to_string());
|
||||
}
|
||||
if let Some(new) = self.encoder.match_line(line) {
|
||||
// This needs to be done before parse_encoded_frames, as it potentially
|
||||
// mutates the string
|
||||
enc_stderr.push_str(line);
|
||||
enc_stderr.push('\n');
|
||||
|
||||
// SAFETY: we do not read the contents of `line` after this function call
|
||||
if let Some(new) = unsafe { self.encoder.parse_encoded_frames(line) } {
|
||||
drop(line);
|
||||
if new > frame {
|
||||
if self.verbosity == Verbosity::Normal {
|
||||
update_bar((new - frame) as u64);
|
||||
|
@ -326,8 +333,6 @@ impl EncodeArgs {
|
|||
frame = new;
|
||||
}
|
||||
}
|
||||
enc_stderr.push_str(line);
|
||||
enc_stderr.push('\n');
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
|
@ -898,7 +903,7 @@ properly into a mkv file. Specify mkvmerge as the concatenation method by settin
|
|||
self.workers = cmp::min(self.workers, chunk_queue.len());
|
||||
|
||||
eprintln!(
|
||||
"{}{} {} {}{} {} {}{} {}\n{}: {}\n",
|
||||
"{}{} {} {}{} {} {}{} {}\n{}: {}",
|
||||
Color::Green.bold().paint("Q"),
|
||||
Color::Green.paint("ueue"),
|
||||
Color::Green.bold().paint(chunk_queue.len().to_string()),
|
||||
|
|
250
av1an-core/src/simd.rs
Normal file
250
av1an-core/src/simd.rs
Normal file
|
@ -0,0 +1,250 @@
|
|||
//! SIMD-optimized functions for parsing
|
||||
|
||||
/// x86 SIMD implementation of parsing aomenc/vpxenc output using
|
||||
/// SSSE3+SSE4.1, returning the number of frames processed, or `None`
|
||||
/// if the input did not match.
|
||||
///
|
||||
/// This function also works for parsing vpxenc output, as its progress
|
||||
/// printing is exactly the same.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller should not attempt to read the contents of `s` after this
|
||||
/// function has been called, and must ensure the CPU supports SSSE3 and SSE4.1.
|
||||
#[inline]
|
||||
#[target_feature(enable = "ssse3,sse4.1")]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
pub(crate) unsafe fn parse_aom_vpx_frames_sse41(s: &mut [u8]) -> Option<u64> {
|
||||
#[cfg(target_arch = "x86")]
|
||||
use std::arch::x86::*;
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
use std::arch::x86_64::*;
|
||||
|
||||
use std::mem::transmute;
|
||||
|
||||
// This implementation matches the *second* number in the output. Ex:
|
||||
// Pass 1/1 frame 142/141 156465B 208875 us 679.83 fps [ETA unknown]
|
||||
// ^^^
|
||||
// matches this number and returns `Some(141)`
|
||||
//
|
||||
// Pass 1/1 frame 102262/102261 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F
|
||||
// ^^^^^^
|
||||
// matches this number and returns `Some(102261)`
|
||||
//
|
||||
// If invalid input is detected, this function returns `None`.
|
||||
// We cheat in this implementation by taking a mutable slice to the string
|
||||
// so we can reuse its allocation to add padding zeroes for free.
|
||||
|
||||
// Number of bytes processed (size in bytes of xmm register)
|
||||
//
|
||||
// There is no benefit to using wider SIMD lanes in this case, so we just
|
||||
// use the most commonly available SIMD width. This is because we want
|
||||
// to parse the fewest number of bytes possible to get the correct result.
|
||||
const CHUNK_SIZE: usize = 16;
|
||||
|
||||
// We can safely always ignore this prefix, as the second number will
|
||||
// always be at some point after this prefix. See examples of aomenc
|
||||
// output below to see why this is the case.
|
||||
#[rustfmt::skip]
|
||||
const IGNORED_PREFIX: &str =
|
||||
"Pass x/x frame x/";
|
||||
// Pass 1/1 frame 3/2 2131B 5997 us 500.25 fps [ETA unknown]
|
||||
// ^ relevant output starts at this character
|
||||
// Pass 1/1 frame 84/83 81091B 132314 us 634.85 fps [ETA unknown]
|
||||
// ^
|
||||
// Pass 1/1 frame 142/141 156465B 208875 us 679.83 fps [ETA unknown]
|
||||
// ^
|
||||
// Pass 1/1 frame 4232/4231 5622510B 5518075 us 766.93 fps [ETA unknown]
|
||||
// ^
|
||||
// Pass 1/1 frame 13380/13379 17860525B 16760 ms 798.31 fps [ETA unknown]
|
||||
// ^
|
||||
// Pass 1/1 frame 102262/102261 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F
|
||||
// ^
|
||||
// Pass 1/1 frame 1022621/1022611 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F
|
||||
// ^
|
||||
//
|
||||
// As you can see, the relevant part of the output always starts past
|
||||
// the length of the ignored prefix.
|
||||
|
||||
// This implementation needs to read `CHUNK_SIZE` bytes past the ignored
|
||||
// prefix, so we pay the cost of the bounds check only once at the start
|
||||
// of this function. This also serves as an input validation check.
|
||||
if s.len() < IGNORED_PREFIX.len() + CHUNK_SIZE {
|
||||
return None;
|
||||
}
|
||||
// Sanity check to see if we should parse the line. Processing invalid input
|
||||
// anyway would result in returning a garbage value, ultimately causing the
|
||||
// frame counter to be completely off.
|
||||
if !(s.starts_with(b"Pass 2/2") || s.starts_with(b"Pass 1/1")) {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Since the aomenc output follows a particular pattern, we can calculate the
|
||||
// position of the '/' character from the index of the first space (how to
|
||||
// do so is explained later on). We create this mask to find the first space
|
||||
// in the output.
|
||||
let spaces = _mm_set1_epi8(b' ' as i8);
|
||||
|
||||
// Load the relevant part of the output, which are the 16 bytes after the ignored prefix.
|
||||
// This is safe because we already asserted that at least `IGNORED_PREFIX.len() + CHUNK_SIZE`
|
||||
// bytes are available, and `_mm_loadu_si128` loads `CHUNK_SIZE` (16) bytes.
|
||||
let relevant_output =
|
||||
_mm_loadu_si128(s.get_unchecked(IGNORED_PREFIX.len()..).as_ptr() as *const _);
|
||||
|
||||
// Compare the relevant output to spaces to create a mask where each bit
|
||||
// is set to 1 if the corresponding character was a space, and 0 otherwise.
|
||||
// The LSB corresponds to the match between the first characters.
|
||||
//
|
||||
// Only the lower 16 bits are relevant, as the rest are always set to 0.
|
||||
let mask16 = _mm_movemask_epi8(_mm_cmpeq_epi8(relevant_output, spaces));
|
||||
|
||||
// The bits in the mask are set as so:
|
||||
//
|
||||
// "141 156465B 208875 us 679.83 fps [ETA unknown]"
|
||||
// mask: 110000000111000
|
||||
// ^^^
|
||||
// These bits correspond to the first 3 characters: "141".
|
||||
// Since they do not match the space, they are set to 0 in the mask.
|
||||
// As printed, the leftmost bit is the most significant bit.
|
||||
// ^^^
|
||||
// These bits correspond to the 3 spaces after the "141".
|
||||
//
|
||||
// "2/102261 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F"
|
||||
// mask: 100000000
|
||||
// ^^^^^^^^
|
||||
// These bits correspond to the first 8 characters: "2/102261".
|
||||
//
|
||||
// To get the index of the first space, we need to get the trailing zeros,
|
||||
// which correspond to the first characters.
|
||||
//
|
||||
// This value is such that `relevant_output[first_space_index]` gives the
|
||||
// actual first space character.
|
||||
let first_space_index = mask16.trailing_zeros() as usize;
|
||||
|
||||
// It is impossible that `first_space_index == 0` for valid aomenc output, since the
|
||||
// first character after the ignored prefix has to be a digit.
|
||||
//
|
||||
// All indexes are relative to `relevant_output`.
|
||||
// ↓ Since the first digit occurs here, its index = 0
|
||||
// Pass 1/1 frame 3/2 2131B 5997 us 500.25 fps [ETA unknown]
|
||||
// * ^ n = 1, first_digit_index = 0 (the first character is the first digit)
|
||||
// ↑
|
||||
// ╰ end of ignored prefix (continued below)
|
||||
//
|
||||
// Pass 1/1 frame 84/83 81091B 132314 us 634.85 fps [ETA unknown]
|
||||
// * ^ n = 2, first_digit_index = 0
|
||||
//
|
||||
// Pass 1/1 frame 142/141 156465B 208875 us 679.83 fps [ETA unknown]
|
||||
// * ^ n = 3, first_digit_index = 0
|
||||
//
|
||||
// Pass 1/1 frame 4232/4231 5622510B 5518075 us 766.93 fps [ETA unknown]
|
||||
// * ^ n = 4, first_digit_index = 0
|
||||
//
|
||||
// Pass 1/1 frame 13380/13379 17860525B 16760 ms 798.31 fps [ETA unknown]
|
||||
// * ^ n = 6, first_digit_index = 1
|
||||
//
|
||||
// Pass 1/1 frame 102262/102261 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F
|
||||
// * ^ n = 8, first_digit_index = 2
|
||||
// ^^^^^^^^
|
||||
// 12345678
|
||||
// n = 8 signifies that there are 8 characters before the first space.
|
||||
// This also happens to be first_space_index.
|
||||
//
|
||||
// Pass 1/1 frame 1022621/1022611 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F
|
||||
// * ^ n = 10, first_digit_index = 3
|
||||
//
|
||||
// Solving a linear equation for n and first_digit_index yields this
|
||||
// formula, but first_digit_index cannot be negative so we use saturating_sub.
|
||||
let first_digit_index = (first_space_index / 2).saturating_sub(2);
|
||||
|
||||
// Set `CHUNK_SIZE` bytes before the real first digit index (including the ignored prefix)
|
||||
// to b'0'. Unconditionally zeroing `CHUNK_SIZE` bytes is better than only setting the
|
||||
// bytes that are absolutely necessary because using a fixed size allows LLVM to avoid
|
||||
// a call to memset and instead use movaps/movups.
|
||||
for byte in s
|
||||
.get_unchecked_mut(IGNORED_PREFIX.len() + first_digit_index - CHUNK_SIZE..)
|
||||
.get_unchecked_mut(..CHUNK_SIZE)
|
||||
{
|
||||
*byte = b'0';
|
||||
}
|
||||
|
||||
// At this point, we have done all the setup and can use the actual SIMD integer
|
||||
// parsing algorithm. The description of the algorithm can be found here:
|
||||
// https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html
|
||||
let mut chunk = _mm_loadu_si128(
|
||||
s.as_ptr()
|
||||
.add(IGNORED_PREFIX.len() + first_space_index - CHUNK_SIZE) as *const _,
|
||||
);
|
||||
|
||||
let zeros = _mm_set1_epi8(b'0' as i8);
|
||||
chunk = _mm_sub_epi8(chunk, zeros);
|
||||
|
||||
let mult = _mm_set_epi8(1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10);
|
||||
chunk = _mm_maddubs_epi16(chunk, mult);
|
||||
let mult = _mm_set_epi16(1, 100, 1, 100, 1, 100, 1, 100);
|
||||
chunk = _mm_madd_epi16(chunk, mult);
|
||||
chunk = _mm_packus_epi32(chunk, chunk);
|
||||
let mult = _mm_set_epi16(0, 0, 0, 0, 1, 10000, 1, 10000);
|
||||
chunk = _mm_madd_epi16(chunk, mult);
|
||||
|
||||
let chunk = transmute::<_, [u64; 2]>(chunk);
|
||||
|
||||
Some(((chunk[0] & 0xffffffff) * 100000000) + (chunk[0] >> 32))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::simd::parse_aom_vpx_frames_sse41;
|
||||
|
||||
#[test]
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
fn aom_vpx_sse41_parsing() {
|
||||
let test_cases = [
|
||||
(
|
||||
"Pass 1/1 frame 3/2 2131B 5997 us 500.25 fps [ETA unknown]",
|
||||
Some(2),
|
||||
),
|
||||
(
|
||||
"Pass 2/2 frame 84/83 81091B 132314 us 634.85 fps [ETA unknown]",
|
||||
Some(83),
|
||||
),
|
||||
(
|
||||
"Pass 1/1 frame 142/141 156465B 208875 us 679.83 fps [ETA unknown]",
|
||||
Some(141),
|
||||
),
|
||||
(
|
||||
"Pass 1/2 frame 142/141 156465B 208875 us 679.83 fps [ETA unknown]",
|
||||
None,
|
||||
),
|
||||
(
|
||||
"Pass 2/2 frame 4232/4231 5622510B 5518075 us 766.93 fps [ETA unknown]",
|
||||
Some(4231),
|
||||
),
|
||||
(
|
||||
"Pass 1/1 frame 13380/13379 17860525B 16760 ms 798.31 fps [ETA unknown]",
|
||||
Some(13379),
|
||||
),
|
||||
(
|
||||
"Pass 1/2 frame 13380/13379 17860525B 16760 ms 798.31 fps [ETA unknown]",
|
||||
None,
|
||||
),
|
||||
("invalid data", None),
|
||||
(
|
||||
"Pass 2/2 frame 102262/102261 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F",
|
||||
Some(102261),
|
||||
),
|
||||
(
|
||||
"Pass 1/1 frame 1022621/1022611 136473850B 131502 ms 777.65 fps [ETA unknown] 1272F",
|
||||
Some(1022611),
|
||||
),
|
||||
];
|
||||
|
||||
if is_x86_feature_detected!("sse4.1") {
|
||||
for (s, ans) in test_cases {
|
||||
let mut s = String::from(s);
|
||||
|
||||
assert_eq!(unsafe { parse_aom_vpx_frames_sse41(s.as_bytes_mut()) }, ans);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue