vhost: introduce async enqueue registration API

Performing large memory copies usually consumes a major share of CPU
cycles and becomes the hot spot of the vhost-user enqueue operation.
To offload these large copies from the CPU to DMA devices, asynchronous
APIs are introduced, with which the CPU merely submits copy jobs to the
DMA engine and does not wait for copy completion. Thus, there is no CPU
intervention during data transfer, saving precious CPU cycles and
improving overall throughput for vhost-user based applications.

This patch introduces the registration/unregistration APIs for the
vhost async enqueue data path. Along with the registration API
implementations, the data structures and the prototypes of the async
callback functions required by the async enqueue data path are also
defined.
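
As a rough illustration of the intended usage (not part of this patch), an application supplies the two callbacks through rte_vhost_async_channel_ops and attaches/detaches the channel per virtqueue; the my_* callback names below are placeholders for an application-provided DMA backend, and the feature word is built with the rte_vhost_async_features union introduced further down:

#include <stdint.h>
#include <sys/uio.h>
#include "rte_vhost_async.h"

/* placeholder callbacks provided by the application's DMA backend */
extern int my_transfer_data(int vid, uint16_t queue_id,
		struct rte_vhost_async_desc *descs,
		struct rte_vhost_async_status *opaque_data, uint16_t count);
extern int my_check_completed_copies(int vid, uint16_t queue_id,
		struct rte_vhost_async_status *opaque_data, uint16_t max_packets);

static struct rte_vhost_async_channel_ops my_ops = {
	.transfer_data = my_transfer_data,
	.check_completed_copies = my_check_completed_copies,
};

/* attach an async channel to one virtqueue, e.g. once it becomes ready */
static int
attach_async_channel(int vid, uint16_t queue_id, uint32_t features)
{
	return rte_vhost_async_channel_register(vid, queue_id, features, &my_ops);
}

/* detach it again before the queue or device goes away;
 * this returns -1 while async copies are still in flight.
 */
static int
detach_async_channel(int vid, uint16_t queue_id)
{
	return rte_vhost_async_channel_unregister(vid, queue_id);
}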

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
Reviewed-by: Chenbo Xia <chenbo.xia@intel.com>
Patrick Fu, 2020-07-07 13:07:08 +08:00; committed by Ferruh Yigit
parent cd8b4be013
commit 78639d5456
9 changed files with 344 additions and 7 deletions

@@ -42,7 +42,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h \
rte_vdpa_dev.h
rte_vdpa_dev.h rte_vhost_async.h
# only compile vhost crypto when cryptodev is enabled
ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)

@@ -22,5 +22,5 @@ sources = files('fd_man.c', 'iotlb.c', 'socket.c', 'vdpa.c',
'vhost.c', 'vhost_user.c',
'virtio_net.c', 'vhost_crypto.c')
headers = files('rte_vhost.h', 'rte_vdpa.h', 'rte_vdpa_dev.h',
'rte_vhost_crypto.h')
'rte_vhost_crypto.h', 'rte_vhost_async.h')
deps += ['ethdev', 'cryptodev', 'hash', 'pci']

@@ -35,6 +35,7 @@ extern "C" {
#define RTE_VHOST_USER_EXTBUF_SUPPORT (1ULL << 5)
/* support only linear buffers (no chained mbufs) */
#define RTE_VHOST_USER_LINEARBUF_SUPPORT (1ULL << 6)
#define RTE_VHOST_USER_ASYNC_COPY (1ULL << 7)
/* Features. */
#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE

@@ -0,0 +1,136 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2020 Intel Corporation
*/
#ifndef _RTE_VHOST_ASYNC_H_
#define _RTE_VHOST_ASYNC_H_
#include "rte_vhost.h"
/**
* iovec iterator
*/
struct rte_vhost_iov_iter {
/** offset to the first byte of interesting data */
size_t offset;
/** total bytes of data in this iterator */
size_t count;
/** pointer to the iovec array */
struct iovec *iov;
/** number of iovec in this iterator */
unsigned long nr_segs;
};
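
For illustration only (not part of the patch), a single flat buffer maps onto this iterator as follows; the helper name is hypothetical:

#include <sys/uio.h>
#include "rte_vhost_async.h"

/* describe one contiguous buffer with an iov_iter */
static void
iter_from_flat_buf(struct rte_vhost_iov_iter *it, struct iovec *seg,
		void *buf, size_t len)
{
	seg->iov_base = buf;
	seg->iov_len = len;
	it->offset = 0;		/* payload starts at the first byte */
	it->count = len;	/* total bytes covered by the iterator */
	it->iov = seg;
	it->nr_segs = 1;
}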
/**
* dma transfer descriptor pair
*/
struct rte_vhost_async_desc {
/** source memory iov_iter */
struct rte_vhost_iov_iter *src;
/** destination memory iov_iter */
struct rte_vhost_iov_iter *dst;
};
/**
* dma transfer status
*/
struct rte_vhost_async_status {
/** An array of application specific data for source memory */
uintptr_t *src_opaque_data;
/** An array of application specific data for destination memory */
uintptr_t *dst_opaque_data;
};
/**
* dma operation callbacks to be implemented by applications
*/
struct rte_vhost_async_channel_ops {
/**
* instruct async engines to perform copies for a batch of packets
*
* @param vid
* id of vhost device to perform data copies
* @param queue_id
* queue id to perform data copies
* @param descs
* an array of DMA transfer memory descriptors
* @param opaque_data
* opaque data pair sent to the DMA engine
* @param count
* number of elements in the "descs" array
* @return
* -1 on failure, number of descs processed on success
*/
int (*transfer_data)(int vid, uint16_t queue_id,
struct rte_vhost_async_desc *descs,
struct rte_vhost_async_status *opaque_data,
uint16_t count);
/**
* check copy-completed packets from the async engine
* @param vid
* id of vhost device to check copy completion
* @param queue_id
* queue id to check copy completion
* @param opaque_data
* buffer to receive the opaque data pair from DMA engine
* @param max_packets
* max number of packets that could be completed
* @return
* -1 on failure, number of iov segments completed on success
*/
int (*check_completed_copies)(int vid, uint16_t queue_id,
struct rte_vhost_async_status *opaque_data,
uint16_t max_packets);
};
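
To make the callback contract concrete, here is a hedged sketch of a purely software backend: it copies synchronously with memcpy and reports everything it accepted as completed on the next poll. A real backend would submit the descriptors to a DMA engine instead; MAX_VHOST_QUEUES, the sw_* names and the simplified per-descriptor completion accounting are assumptions of this example:

#include <string.h>
#include <sys/uio.h>
#include <rte_common.h>
#include "rte_vhost_async.h"

#define MAX_VHOST_QUEUES 128	/* sizing assumption for this sketch */

/* copies accepted per queue but not yet reported back as completed */
static uint16_t sw_done[MAX_VHOST_QUEUES];

/* copy src->count bytes from the source iterator to the destination one */
static void
sw_copy_iter(const struct rte_vhost_iov_iter *src,
		const struct rte_vhost_iov_iter *dst)
{
	size_t s = 0, d = 0, s_off = src->offset, d_off = dst->offset;
	size_t left = src->count;

	/* skip whole segments covered by the starting offsets */
	while (s < src->nr_segs && s_off >= src->iov[s].iov_len)
		s_off -= src->iov[s++].iov_len;
	while (d < dst->nr_segs && d_off >= dst->iov[d].iov_len)
		d_off -= dst->iov[d++].iov_len;

	while (left && s < src->nr_segs && d < dst->nr_segs) {
		size_t len = RTE_MIN(src->iov[s].iov_len - s_off,
				dst->iov[d].iov_len - d_off);

		len = RTE_MIN(len, left);
		memcpy((uint8_t *)dst->iov[d].iov_base + d_off,
			(const uint8_t *)src->iov[s].iov_base + s_off, len);
		s_off += len;
		d_off += len;
		left -= len;
		if (s_off == src->iov[s].iov_len) {
			s++;
			s_off = 0;
		}
		if (d_off == dst->iov[d].iov_len) {
			d++;
			d_off = 0;
		}
	}
}

static int
sw_transfer_data(int vid, uint16_t queue_id,
		struct rte_vhost_async_desc *descs,
		struct rte_vhost_async_status *opaque_data, uint16_t count)
{
	uint16_t i;

	RTE_SET_USED(vid);
	RTE_SET_USED(opaque_data);
	for (i = 0; i < count; i++)
		sw_copy_iter(descs[i].src, descs[i].dst);
	sw_done[queue_id] += count;
	return count;	/* all submitted descriptors were accepted */
}

static int
sw_check_completed_copies(int vid, uint16_t queue_id,
		struct rte_vhost_async_status *opaque_data, uint16_t max_packets)
{
	uint16_t n = RTE_MIN(sw_done[queue_id], max_packets);

	RTE_SET_USED(vid);
	RTE_SET_USED(opaque_data);
	sw_done[queue_id] -= n;
	return n;	/* completed since the previous poll */
}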
/**
* dma channel feature bit definition
*/
struct rte_vhost_async_features {
union {
uint32_t intval;
struct {
uint32_t async_inorder:1;
uint32_t resvd_0:15;
uint32_t async_threshold:12;
uint32_t resvd_1:4;
};
};
};
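
A small sketch of composing the 32-bit feature word through this union; the 256-byte threshold is an arbitrary example value, not a default from this patch:

#include "rte_vhost_async.h"

static uint32_t
make_async_features(void)
{
	struct rte_vhost_async_features f;

	f.intval = 0;
	f.async_inorder = 1;	/* the engine completes copies in order */
	f.async_threshold = 256;	/* packet length threshold for DMA transfer */
	return f.intval;	/* pass to rte_vhost_async_channel_register() */
}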
/**
* register an async channel for vhost
*
* @param vid
* vhost device id the async channel will be attached to
* @param queue_id
* vhost queue id the async channel will be attached to
* @param features
* DMA channel feature bits
* b0 : DMA supports inorder data transfer
* b1 - b15: reserved
* b16 - b27: Packet length threshold for DMA transfer
* b28 - b31: reserved
* @param ops
* DMA operation callbacks
* @return
* 0 on success, -1 on failure
*/
__rte_experimental
int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
uint32_t features, struct rte_vhost_async_channel_ops *ops);
/**
* unregister a dma channel for vhost
*
* @param vid
* vhost device id the DMA channel will be detached from
* @param queue_id
* vhost queue id the DMA channel will be detached from
* @return
* 0 on success, -1 on failure
*/
__rte_experimental
int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);
#endif /* _RTE_VHOST_ASYNC_H_ */

@@ -71,4 +71,8 @@ EXPERIMENTAL {
rte_vdpa_get_queue_num;
rte_vdpa_get_features;
rte_vdpa_get_protocol_features;
rte_vhost_async_channel_register;
rte_vhost_async_channel_unregister;
rte_vhost_submit_enqueue_burst;
rte_vhost_poll_enqueue_completed;
};

@@ -42,6 +42,7 @@ struct vhost_user_socket {
bool use_builtin_virtio_net;
bool extbuf;
bool linearbuf;
bool async_copy;
/*
* The "supported_features" indicates the feature bits the
@@ -205,6 +206,7 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
size_t size;
struct vhost_user_connection *conn;
int ret;
struct virtio_net *dev;
if (vsocket == NULL)
return;
@@ -236,6 +238,13 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
if (vsocket->linearbuf)
vhost_enable_linearbuf(vid);
if (vsocket->async_copy) {
dev = get_device(vid);
if (dev)
dev->async_copy = 1;
}
VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
if (vsocket->notify_ops->new_connection) {
@@ -881,6 +890,17 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
goto out_mutex;
}
vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
if (vsocket->async_copy &&
(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
VHOST_LOG_CONFIG(ERR, "error: enabling async copy and IOMMU "
"or post-copy feature simultaneously is not "
"supported\n");
goto out_mutex;
}
/*
* Set the supported features correctly for the builtin vhost-user
* net driver.
@@ -931,6 +951,13 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
}
if (vsocket->async_copy) {
vsocket->supported_features &= ~(1ULL << VHOST_F_LOG_ALL);
vsocket->features &= ~(1ULL << VHOST_F_LOG_ALL);
VHOST_LOG_CONFIG(INFO,
"Logging feature is disabled in async copy mode\n");
}
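
On the application side, the new path is selected when the socket is registered with RTE_VHOST_USER_ASYNC_COPY; as the checks above show, the flag cannot be combined with IOMMU or post-copy support, and dirty-page logging is force-disabled. A minimal sketch (the socket path is an example value):

#include <rte_vhost.h>

static int
register_async_socket(const char *path)
{
	/* must not be combined with RTE_VHOST_USER_IOMMU_SUPPORT or
	 * RTE_VHOST_USER_POSTCOPY_SUPPORT, or registration fails
	 */
	uint64_t flags = RTE_VHOST_USER_ASYNC_COPY;

	if (rte_vhost_driver_register(path, flags) < 0)
		return -1;
	/* ... set features and callbacks as usual ... */
	return rte_vhost_driver_start(path);
}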
/*
* We'll not be able to receive a buffer from guest in linear mode
* without external buffer if it will not fit in a single mbuf, which is

@@ -329,8 +329,13 @@ free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
if (vq_is_packed(dev))
rte_free(vq->shadow_used_packed);
else
else {
rte_free(vq->shadow_used_split);
if (vq->async_pkts_pending)
rte_free(vq->async_pkts_pending);
if (vq->async_pending_info)
rte_free(vq->async_pending_info);
}
rte_free(vq->batch_copy_elems);
rte_mempool_free(vq->iotlb_pool);
rte_free(vq);
@@ -1509,5 +1514,124 @@ int rte_vhost_extern_callback_register(int vid,
return 0;
}
int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
uint32_t features,
struct rte_vhost_async_channel_ops *ops)
{
struct vhost_virtqueue *vq;
struct virtio_net *dev = get_device(vid);
struct rte_vhost_async_features f;
if (dev == NULL || ops == NULL)
return -1;
f.intval = features;
vq = dev->virtqueue[queue_id];
if (unlikely(vq == NULL || !dev->async_copy))
return -1;
/* packed queue is not supported */
if (unlikely(vq_is_packed(dev) || !f.async_inorder)) {
VHOST_LOG_CONFIG(ERR,
"async copy is not supported on packed queue or non-inorder mode "
"(vid %d, qid: %d)\n", vid, queue_id);
return -1;
}
if (unlikely(ops->check_completed_copies == NULL ||
ops->transfer_data == NULL))
return -1;
rte_spinlock_lock(&vq->access_lock);
if (unlikely(vq->async_registered)) {
VHOST_LOG_CONFIG(ERR,
"async register failed: channel already registered "
"(vid %d, qid: %d)\n", vid, queue_id);
goto reg_out;
}
vq->async_pkts_pending = rte_malloc(NULL,
vq->size * sizeof(uintptr_t),
RTE_CACHE_LINE_SIZE);
vq->async_pending_info = rte_malloc(NULL,
vq->size * sizeof(uint64_t),
RTE_CACHE_LINE_SIZE);
if (!vq->async_pkts_pending || !vq->async_pending_info) {
if (vq->async_pkts_pending)
rte_free(vq->async_pkts_pending);
if (vq->async_pending_info)
rte_free(vq->async_pending_info);
VHOST_LOG_CONFIG(ERR,
"async register failed: cannot allocate memory for vq data "
"(vid %d, qid: %d)\n", vid, queue_id);
goto reg_out;
}
vq->async_ops.check_completed_copies = ops->check_completed_copies;
vq->async_ops.transfer_data = ops->transfer_data;
vq->async_inorder = f.async_inorder;
vq->async_threshold = f.async_threshold;
vq->async_registered = true;
reg_out:
rte_spinlock_unlock(&vq->access_lock);
return 0;
}
int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
{
struct vhost_virtqueue *vq;
struct virtio_net *dev = get_device(vid);
int ret = -1;
if (dev == NULL)
return ret;
vq = dev->virtqueue[queue_id];
if (vq == NULL)
return ret;
ret = 0;
rte_spinlock_lock(&vq->access_lock);
if (!vq->async_registered)
goto out;
if (vq->async_pkts_inflight_n) {
VHOST_LOG_CONFIG(ERR, "Failed to unregister async channel. "
"async inflight packets must be completed before unregistration.\n");
ret = -1;
goto out;
}
if (vq->async_pkts_pending) {
rte_free(vq->async_pkts_pending);
vq->async_pkts_pending = NULL;
}
if (vq->async_pending_info) {
rte_free(vq->async_pending_info);
vq->async_pending_info = NULL;
}
vq->async_ops.transfer_data = NULL;
vq->async_ops.check_completed_copies = NULL;
vq->async_registered = false;
out:
rte_spinlock_unlock(&vq->access_lock);
return ret;
}
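
Since unregistration refuses to proceed while enqueue packets are still in flight, an application teardown path typically drains completions first. A sketch, assuming the enqueue data-path API rte_vhost_poll_enqueue_completed() that is exported in the version map above but implemented by the follow-up patch:

#include <rte_mbuf.h>
#include "rte_vhost_async.h"

#define DRAIN_BURST 32	/* arbitrary burst size for this sketch */

static int
teardown_async_channel(int vid, uint16_t queue_id)
{
	struct rte_mbuf *pkts[DRAIN_BURST];
	uint16_t n, i;

	/* take back and free everything the async engine has completed */
	do {
		n = rte_vhost_poll_enqueue_completed(vid, queue_id, pkts,
				DRAIN_BURST);
		for (i = 0; i < n; i++)
			rte_pktmbuf_free(pkts[i]);
	} while (n > 0);

	/* still returns -1 if copies remain in flight */
	return rte_vhost_async_channel_unregister(vid, queue_id);
}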
RTE_LOG_REGISTER(vhost_config_log_level, lib.vhost.config, INFO);
RTE_LOG_REGISTER(vhost_data_log_level, lib.vhost.data, WARNING);

@@ -24,6 +24,8 @@
#include "rte_vdpa.h"
#include "rte_vdpa_dev.h"
#include "rte_vhost_async.h"
/* Used to indicate that the device is running on a data core */
#define VIRTIO_DEV_RUNNING 1
/* Used to indicate that the device is ready to operate */
@@ -40,6 +42,11 @@
#define VHOST_LOG_CACHE_NR 32
#define MAX_PKT_BURST 32
#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
#define PACKED_DESC_ENQUEUE_USED_FLAG(w) \
((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
VRING_DESC_F_WRITE)
@@ -202,6 +209,25 @@ struct vhost_virtqueue {
TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
int iotlb_cache_nr;
TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
/* operation callbacks for async dma */
struct rte_vhost_async_channel_ops async_ops;
struct rte_vhost_iov_iter it_pool[VHOST_MAX_ASYNC_IT];
struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
/* async data transfer status */
uintptr_t **async_pkts_pending;
#define ASYNC_PENDING_INFO_N_MSK 0xFFFF
#define ASYNC_PENDING_INFO_N_SFT 16
uint64_t *async_pending_info;
uint16_t async_pkts_idx;
uint16_t async_pkts_inflight_n;
/* vq async features */
bool async_inorder;
bool async_registered;
uint16_t async_threshold;
} __rte_cache_aligned;
#define VHOST_MAX_VRING 0x100
@@ -338,6 +364,7 @@ struct virtio_net {
int16_t broadcast_rarp;
uint32_t nr_vring;
int dequeue_zero_copy;
int async_copy;
int extbuf;
int linearbuf;
struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
@@ -683,7 +710,8 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
/* Don't kick guest if we don't reach index specified by guest. */
if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
uint16_t old = vq->signalled_used;
uint16_t new = vq->last_used_idx;
uint16_t new = vq->async_pkts_inflight_n ?
vq->used->idx:vq->last_used_idx;
bool signalled_used_valid = vq->signalled_used_valid;
vq->signalled_used = new;

@@ -476,12 +476,14 @@ vhost_user_set_vring_num(struct virtio_net **pdev,
} else {
if (vq->shadow_used_split)
rte_free(vq->shadow_used_split);
vq->shadow_used_split = rte_malloc(NULL,
vq->size * sizeof(struct vring_used_elem),
RTE_CACHE_LINE_SIZE);
if (!vq->shadow_used_split) {
VHOST_LOG_CONFIG(ERR,
"failed to allocate memory for shadow used ring.\n");
"failed to allocate memory for vq internal data.\n");
return RTE_VHOST_MSG_RESULT_ERR;
}
}
@@ -1166,7 +1168,8 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
goto err_mmap;
}
populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
populate = (dev->dequeue_zero_copy || dev->async_copy) ?
MAP_POPULATE : 0;
mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
MAP_SHARED | populate, fd, 0);
@@ -1181,7 +1184,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
mmap_offset;
if (dev->dequeue_zero_copy)
if (dev->dequeue_zero_copy || dev->async_copy)
if (add_guest_pages(dev, reg, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region %u failed.\n",
@@ -1979,6 +1982,12 @@ vhost_user_get_vring_base(struct virtio_net **pdev,
} else {
rte_free(vq->shadow_used_split);
vq->shadow_used_split = NULL;
if (vq->async_pkts_pending)
rte_free(vq->async_pkts_pending);
if (vq->async_pending_info)
rte_free(vq->async_pending_info);
vq->async_pkts_pending = NULL;
vq->async_pending_info = NULL;
}
rte_free(vq->batch_copy_elems);
@@ -2012,6 +2021,14 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
"set queue enable: %d to qp idx: %d\n",
enable, index);
if (!enable && dev->virtqueue[index]->async_registered) {
if (dev->virtqueue[index]->async_pkts_inflight_n) {
VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
"async inflight packets must be completed first\n");
return RTE_VHOST_MSG_RESULT_ERR;
}
}
/* On disable, rings have to be stopped being processed. */
if (!enable && dev->dequeue_zero_copy)
drain_zmbuf_list(dev->virtqueue[index]);