block-6.16-20250626

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmhd4zsQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpmZgEACOk81RNf8WGNQf4/parSENzebWNj9W+fKD
 RDhWxwBAquT2VzkF8Iu6wbteVbP9A8yq4BagbD079OWrr0iV8NgWA5y1GyqdER6N
 upe2ZtBlY7RR4F1FerpSGqRBbhWYejNojSr073ea8mmx5Yl0BbHz5aKKmzWGbUYO
 lveYPgCeL4dD7kfPeINiamhicLudyAGdqqYpG+/wriefwaVhTgCe+4aQ6pEwftRT
 utqCzrpUnxrmXS4TFXiWd4u3iVNwPhzcMyUrgkK1yTM7mWIqp8QyHzfF4Acbh/T3
 RN/8d5OCfYmamlRvDUCl3FXWukkdGtBrA4m51mhUIzRJ9Np9IiSHdd2UTDgGqSeG
 2NSjLtmdDQvtVXeuqBs56os7e3DFx42LZuceqbGWaTQ4VC4QE+Xz+n2ZENx/hWFZ
 /lixcIBdxt6iqjveJuBJeXW6UqaR+Hz4hpSigZU69DMQzrKm65bSoMdOvyn5b0bU
 GtlPusSnfgpsSe/H41Lm7SLBePiGXMJvhujzlkWW5cnUUl+yRUQhTO206kQJkbV1
 XUMs8Syow15gjQaXI9KiAq+MMUuUwOvXmptMyYQ1NjFy16yzhJ8QOhJilJLWfLdT
 SqsLyXn1kG2EdcPmXHJRthIgVmQ+uORy2JB1wAomyjJj9a16wJYhgCGDjrl4mocl
 9LpjfnyMsA==
 =ln4w
 -----END PGP SIGNATURE-----

Merge tag 'block-6.16-20250626' of git://git.kernel.dk/linux

Pull block fixes from Jens Axboe:

 - Fixes for ublk:
      - fix C++ narrowing warnings in the uapi header
      - update/improve UBLK_F_SUPPORT_ZERO_COPY comment in uapi header
      - fix for the ublk ->queue_rqs() implementation, limiting a batch
        to just the specific task AND ring
      - ublk_get_data() error handling fix
      - sanity check more arguments in ublk_ctrl_add_dev()
      - selftest addition

 - NVMe pull request via Christoph:
      - reset delayed remove_work after reconnect
      - fix atomic write size validation

 - Fix for a warning introduced in bdev_count_inflight_rw() in this
   merge window

* tag 'block-6.16-20250626' of git://git.kernel.dk/linux:
  block: fix false warning in bdev_count_inflight_rw()
  ublk: sanity check add_dev input for underflow
  nvme: fix atomic write size validation
  nvme: refactor the atomic write unit detection
  nvme: reset delayed remove_work after reconnect
  ublk: setup ublk_io correctly in case of ublk_get_data() failure
  ublk: update UBLK_F_SUPPORT_ZERO_COPY comment in UAPI header
  ublk: fix narrowing warnings in UAPI header
  selftests: ublk: don't take same backing file for more than one ublk devices
  ublk: build batch from IOs in same io_ring_ctx and io task
Linus Torvalds, 2025-06-27 09:02:33 -07:00
commit e540341508
7 changed files with 125 additions and 79 deletions


@@ -128,23 +128,27 @@ static void part_stat_read_all(struct block_device *part,
 static void bdev_count_inflight_rw(struct block_device *part,
 		unsigned int inflight[2], bool mq_driver)
 {
+	int write = 0;
+	int read = 0;
 	int cpu;
 
 	if (mq_driver) {
 		blk_mq_in_driver_rw(part, inflight);
-	} else {
-		for_each_possible_cpu(cpu) {
-			inflight[READ] += part_stat_local_read_cpu(
-						part, in_flight[READ], cpu);
-			inflight[WRITE] += part_stat_local_read_cpu(
-						part, in_flight[WRITE], cpu);
-		}
+		return;
 	}
 
-	if (WARN_ON_ONCE((int)inflight[READ] < 0))
-		inflight[READ] = 0;
-	if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
-		inflight[WRITE] = 0;
+	for_each_possible_cpu(cpu) {
+		read += part_stat_local_read_cpu(part, in_flight[READ], cpu);
+		write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu);
+	}
+
+	/*
+	 * While iterating all CPUs, some IOs may be issued from a CPU already
+	 * traversed and complete on a CPU that has not yet been traversed,
+	 * causing the inflight number to be negative.
+	 */
+	inflight[READ] = read > 0 ? read : 0;
+	inflight[WRITE] = write > 0 ? write : 0;
 }
 
 /**
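
The new comment spells out the race: the per-CPU counters are sampled one CPU at a time, so an IO issued on an already-visited CPU and completed on a not-yet-visited one can make the running total dip below zero. A minimal standalone sketch of the clamp-after-sum pattern the fix switches to (not the kernel code; the per-CPU deltas below are invented):

/*
 * Standalone illustration of the clamp-after-sum pattern used above
 * (not kernel code; the per-CPU deltas are invented).
 */
#include <stdio.h>

int main(void)
{
	/* deltas seen while scanning CPUs: issues minus completions */
	int per_cpu_inflight[4] = { 3, -1, 2, -2 };
	int sum = 0;

	for (int cpu = 0; cpu < 4; cpu++)
		sum += per_cpu_inflight[cpu];

	/* a negative total only means the scan raced with completions */
	unsigned int inflight = sum > 0 ? sum : 0;

	printf("inflight=%u\n", inflight);
	return 0;
}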


@@ -1148,8 +1148,8 @@ exit:
 	blk_mq_end_request(req, res);
 }
 
-static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
-				 int res, unsigned issue_flags)
+static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
+						     struct request *req)
 {
 	/* read cmd first because req will overwrite it */
 	struct io_uring_cmd *cmd = io->cmd;
@@ -1164,6 +1164,13 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
 
 	io->req = req;
+	return cmd;
+}
+
+static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
+				 int res, unsigned issue_flags)
+{
+	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
 
 	/* tell ublksrv one io request is coming */
 	io_uring_cmd_done(cmd, res, 0, issue_flags);
@@ -1416,6 +1423,14 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_STS_OK;
 }
 
+static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
+					     const struct ublk_io *io2)
+{
+	return (io_uring_cmd_ctx_handle(io->cmd) ==
+			io_uring_cmd_ctx_handle(io2->cmd)) &&
+		(io->task == io2->task);
+}
+
 static void ublk_queue_rqs(struct rq_list *rqlist)
 {
 	struct rq_list requeue_list = { };
@@ -1427,7 +1442,8 @@ static void ublk_queue_rqs(struct rq_list *rqlist)
 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
 		struct ublk_io *this_io = &this_q->ios[req->tag];
 
-		if (io && io->task != this_io->task && !rq_list_empty(&submit_list))
+		if (io && !ublk_belong_to_same_batch(io, this_io) &&
+		    !rq_list_empty(&submit_list))
 			ublk_queue_cmd_list(io, &submit_list);
 		io = this_io;
 
@@ -2148,10 +2164,9 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
 	return 0;
 }
 
-static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
+static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
+		struct request *req)
 {
-	struct request *req = io->req;
-
 	/*
 	 * We have handled UBLK_IO_NEED_GET_DATA command,
 	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
@@ -2178,6 +2193,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 	u32 cmd_op = cmd->cmd_op;
 	unsigned tag = ub_cmd->tag;
 	int ret = -EINVAL;
+	struct request *req;
 
 	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
 			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
@@ -2236,11 +2252,19 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 			goto out;
 		break;
 	case UBLK_IO_NEED_GET_DATA:
-		io->addr = ub_cmd->addr;
-		if (!ublk_get_data(ubq, io))
-			return -EIOCBQUEUED;
-
-		return UBLK_IO_RES_OK;
+		/*
+		 * ublk_get_data() may fail and fallback to requeue, so keep
+		 * uring_cmd active first and prepare for handling new requeued
+		 * request
+		 */
+		req = io->req;
+		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
+		io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+		if (likely(ublk_get_data(ubq, io, req))) {
+			__ublk_prep_compl_io_cmd(io, req);
+			return UBLK_IO_RES_OK;
+		}
+		break;
 	default:
 		goto out;
 	}
@@ -2825,7 +2849,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 	if (copy_from_user(&info, argp, sizeof(info)))
 		return -EFAULT;
 
-	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES)
+	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
+	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
 		return -EINVAL;
 
 	if (capable(CAP_SYS_ADMIN))
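
The grouping rule added by ublk_belong_to_same_batch() can be shown outside the driver: consecutive requests extend the current batch only while both the io_uring context and the submitting task stay the same, and any change flushes the batch first. A simplified standalone sketch with hypothetical data, not the driver code:

/*
 * Standalone sketch of the batching rule in ublk_belong_to_same_batch():
 * keep extending the current batch only while both the ring context and
 * the submitting task stay the same. Hypothetical data; not driver code.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct item {
	int ring_ctx;	/* stand-in for io_uring_cmd_ctx_handle(io->cmd) */
	int task;	/* stand-in for io->task */
};

static bool same_batch(const struct item *a, const struct item *b)
{
	return a->ring_ctx == b->ring_ctx && a->task == b->task;
}

int main(void)
{
	struct item reqs[] = {
		{ 1, 10 }, { 1, 10 }, { 1, 11 }, { 2, 11 }, { 2, 11 },
	};
	size_t n = sizeof(reqs) / sizeof(reqs[0]);
	size_t start = 0;

	for (size_t i = 1; i <= n; i++) {
		if (i == n || !same_batch(&reqs[i - 1], &reqs[i])) {
			printf("flush batch: requests %zu..%zu\n", start, i - 1);
			start = i;
		}
	}
	return 0;
}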


@@ -2015,21 +2015,41 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
 }
 
-static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
-			struct nvme_id_ns *id, struct queue_limits *lim,
-			u32 bs, u32 atomic_bs)
+static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
+		struct nvme_id_ns *id, struct queue_limits *lim, u32 bs)
 {
-	unsigned int boundary = 0;
+	u32 atomic_bs, boundary = 0;
 
-	if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
-		if (le16_to_cpu(id->nabspf))
+	/*
+	 * We do not support an offset for the atomic boundaries.
+	 */
+	if (id->nabo)
+		return bs;
+
+	if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) {
+		/*
+		 * Use the per-namespace atomic write unit when available.
+		 */
+		atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
+		if (id->nabspf)
 			boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
+	} else {
+		/*
+		 * Use the controller wide atomic write unit. This sucks
+		 * because the limit is defined in terms of logical blocks while
+		 * namespaces can have different formats, and because there is
+		 * no clear language in the specification prohibiting different
+		 * values for different controllers in the subsystem.
+		 */
+		atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
 	}
+
 	lim->atomic_write_hw_max = atomic_bs;
 	lim->atomic_write_hw_boundary = boundary;
 	lim->atomic_write_hw_unit_min = bs;
 	lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
 	lim->features |= BLK_FEAT_ATOMIC_WRITES;
+	return atomic_bs;
 }
 
 static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
@@ -2067,34 +2087,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
 		valid = false;
 	}
 
-	atomic_bs = phys_bs = bs;
-	if (id->nabo == 0) {
-		/*
-		 * Bit 1 indicates whether NAWUPF is defined for this namespace
-		 * and whether it should be used instead of AWUPF. If NAWUPF ==
-		 * 0 then AWUPF must be used instead.
-		 */
-		if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
-			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
-		else
-			atomic_bs = (1 + ns->ctrl->awupf) * bs;
-
-		/*
-		 * Set subsystem atomic bs.
-		 */
-		if (ns->ctrl->subsys->atomic_bs) {
-			if (atomic_bs != ns->ctrl->subsys->atomic_bs) {
-				dev_err_ratelimited(ns->ctrl->device,
-					"%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n",
-					ns->disk ? ns->disk->disk_name : "?",
-					ns->ctrl->subsys->atomic_bs,
-					atomic_bs);
-			}
-		} else
-			ns->ctrl->subsys->atomic_bs = atomic_bs;
-
-		nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
-	}
+	phys_bs = bs;
+	atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);
 
 	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
 		/* NPWG = Namespace Preferred Write Granularity */
@@ -2382,16 +2376,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 	if (!nvme_update_disk_info(ns, id, &lim))
 		capacity = 0;
 
-	/*
-	 * Validate the max atomic write size fits within the subsystem's
-	 * atomic write capabilities.
-	 */
-	if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) {
-		blk_mq_unfreeze_queue(ns->disk->queue, memflags);
-		ret = -ENXIO;
-		goto out;
-	}
-
 	nvme_config_discard(ns, &lim);
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    ns->head->ids.csi == NVME_CSI_ZNS)
@@ -3215,6 +3199,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	memcpy(subsys->model, id->mn, sizeof(subsys->model));
 	subsys->vendor_id = le16_to_cpu(id->vid);
 	subsys->cmic = id->cmic;
+	subsys->awupf = le16_to_cpu(id->awupf);
 
 	/* Versions prior to 1.4 don't necessarily report a valid type */
 	if (id->cntrltype == NVME_CTRL_DISC ||
@@ -3552,6 +3537,15 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 		if (ret)
 			goto out_free;
 	}
+
+	if (le16_to_cpu(id->awupf) != ctrl->subsys->awupf) {
+		dev_err_ratelimited(ctrl->device,
+			"inconsistent AWUPF, controller not added (%u/%u).\n",
+			le16_to_cpu(id->awupf), ctrl->subsys->awupf);
+		ret = -EINVAL;
+		goto out_free;
+	}
+
 	memcpy(ctrl->subsys->firmware_rev, id->fr,
 			sizeof(ctrl->subsys->firmware_rev));
@@ -3647,7 +3641,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 		dev_pm_qos_expose_latency_tolerance(ctrl->device);
 	else if (!ctrl->apst_enabled && prev_apst_enabled)
 		dev_pm_qos_hide_latency_tolerance(ctrl->device);
-	ctrl->awupf = le16_to_cpu(id->awupf);
 
 out_free:
 	kfree(id);
 	return ret;
@@ -4036,6 +4029,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 	list_add_tail_rcu(&ns->siblings, &head->list);
 	ns->head = head;
 	mutex_unlock(&ctrl->subsys->lock);
+
+#ifdef CONFIG_NVME_MULTIPATH
+	cancel_delayed_work(&head->remove_work);
+#endif
 	return 0;
 
 out_put_ns_head:
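
AWUPF and NAWUPF are 0's based, so the limits above work out to (value + 1) logical blocks. A small standalone arithmetic check with made-up values; rounddown_pow_of_two() here is a local stand-in for the kernel helper:

/*
 * Worked example of the 0's based unit math above, with made-up values.
 * rounddown_pow_of_two() is a local stand-in for the kernel helper.
 */
#include <stdio.h>

static unsigned int rounddown_pow_of_two(unsigned int x)
{
	unsigned int p = 1;

	while (p * 2 <= x)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned int bs = 4096;		/* logical block size */
	unsigned int nawupf = 7;	/* 0's based: 8 blocks per atomic write */
	unsigned int nabspf = 15;	/* 0's based: 16-block boundary */

	unsigned int atomic_bs = (1 + nawupf) * bs;			/* 32768 */
	unsigned int boundary = (nabspf + 1) * bs;			/* 65536 */
	unsigned int unit_max = rounddown_pow_of_two(atomic_bs);	/* 32768 */

	printf("atomic_bs=%u boundary=%u unit_max=%u\n",
	       atomic_bs, boundary, unit_max);
	return 0;
}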


@@ -1311,7 +1311,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 		 */
 		if (!try_module_get(THIS_MODULE))
 			goto out;
-		queue_delayed_work(nvme_wq, &head->remove_work,
+		mod_delayed_work(nvme_wq, &head->remove_work,
 				   head->delayed_removal_secs * HZ);
 	} else {
 		list_del_init(&head->entry);


@@ -410,7 +410,6 @@ struct nvme_ctrl {
 
 	enum nvme_ctrl_type cntrltype;
 	enum nvme_dctype dctype;
-	u16 awupf; /* 0's based value. */
 };
 
 static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
@@ -443,11 +442,11 @@ struct nvme_subsystem {
 	u8			cmic;
 	enum nvme_subsys_type	subtype;
 	u16			vendor_id;
+	u16			awupf;	/* 0's based value. */
 	struct ida		ns_ida;
 #ifdef CONFIG_NVME_MULTIPATH
 	enum nvme_iopolicy	iopolicy;
 #endif
-	u32			atomic_bs;
 };
 
 /*


@@ -135,8 +135,28 @@
 #define UBLKSRV_IO_BUF_TOTAL_SIZE	(1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
 
 /*
- * zero copy requires 4k block size, and can remap ublk driver's io
- * request into ublksrv's vm space
+ * ublk server can register data buffers for incoming I/O requests with a sparse
+ * io_uring buffer table. The request buffer can then be used as the data buffer
+ * for io_uring operations via the fixed buffer index.
+ * Note that the ublk server can never directly access the request data memory.
+ *
+ * To use this feature, the ublk server must first register a sparse buffer
+ * table on an io_uring instance.
+ * When an incoming ublk request is received, the ublk server submits a
+ * UBLK_U_IO_REGISTER_IO_BUF command to that io_uring instance. The
+ * ublksrv_io_cmd's q_id and tag specify the request whose buffer to register
+ * and addr is the index in the io_uring's buffer table to install the buffer.
+ * SQEs can now be submitted to the io_uring to read/write the request's buffer
+ * by enabling fixed buffers (e.g. using IORING_OP_{READ,WRITE}_FIXED or
+ * IORING_URING_CMD_FIXED) and passing the registered buffer index in buf_index.
+ * Once the last io_uring operation using the request's buffer has completed,
+ * the ublk server submits a UBLK_U_IO_UNREGISTER_IO_BUF command with q_id, tag,
+ * and addr again specifying the request buffer to unregister.
+ * The ublk request is completed when its buffer is unregistered from all
+ * io_uring instances and the ublk server issues UBLK_U_IO_COMMIT_AND_FETCH_REQ.
+ *
+ * Not available for UBLK_F_UNPRIVILEGED_DEV, as a ublk server can leak
+ * uninitialized kernel memory by not reading into the full request buffer.
 */
 #define UBLK_F_SUPPORT_ZERO_COPY	(1ULL << 0)
@@ -450,10 +470,10 @@ static inline struct ublk_auto_buf_reg ublk_sqe_addr_to_auto_buf_reg(
 		__u64 sqe_addr)
 {
 	struct ublk_auto_buf_reg reg = {
-		.index = sqe_addr & 0xffff,
-		.flags = (sqe_addr >> 16) & 0xff,
-		.reserved0 = (sqe_addr >> 24) & 0xff,
-		.reserved1 = sqe_addr >> 32,
+		.index = (__u16)sqe_addr,
+		.flags = (__u8)(sqe_addr >> 16),
+		.reserved0 = (__u8)(sqe_addr >> 24),
+		.reserved1 = (__u32)(sqe_addr >> 32),
 	};
 
 	return reg;

@@ -32,22 +32,23 @@ _create_backfile 2 128M
 ublk_io_and_remove 8G -t null -q 4 -z &
 ublk_io_and_remove 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
 ublk_io_and_remove 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
 
 if _have_feature "AUTO_BUF_REG"; then
 	ublk_io_and_remove 8G -t null -q 4 --auto_zc &
 	ublk_io_and_remove 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" &
 	ublk_io_and_remove 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
 	ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
+	wait
 fi
-wait
 
 if _have_feature "PER_IO_DAEMON"; then
 	ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks &
 	ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
 	ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
 	ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks &
+	wait
 fi
-wait
 
 _cleanup_test "stress"
 _show_result $TID $ERR_CODE