From 8c6ce8e86dd75db8e6c6a3e5a870e8d52dbab2d0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 12:40:28 -0400
Subject: [PATCH 01/48] attach_mnt(): expand in attach_recursive_mnt(), then
 lose the flag argument

simpler that way - all but one caller pass false as 'beneath' argument,
and that one caller is actually happier with the call expanded - the
logics with choice of mountpoint is identical for 'moving' and 'attaching'
cases, and now that is no longer hidden.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 37 ++++++++++++-------------------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 54c59e091919..1761d2c2fdae 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1116,16 +1116,10 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)
  * @parent:  the parent
  * @mnt:     the new mount
  * @mp:      the new mountpoint
- * @beneath: whether to mount @mnt beneath or on top of @parent
  *
- * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
+ * Mount @mnt at @mp on @parent. Then attach @mnt
  * to @parent's child mount list and to @mount_hashtable.
  *
- * If @beneath is true, remove @mnt from its current parent and
- * mountpoint and mount it on @mp on @parent, and mount @parent on the
- * old parent and old mountpoint of @mnt. Finally, attach @parent to
- * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
- *
  * Note, when __attach_mnt() is called @mnt->mnt_parent already points
  * to the correct parent.
  *
@@ -1133,18 +1127,9 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)
  *          to have been acquired in that order.
  */
 static void attach_mnt(struct mount *mnt, struct mount *parent,
-		       struct mountpoint *mp, bool beneath)
+		       struct mountpoint *mp)
 {
-	if (beneath)
-		mnt_set_mountpoint_beneath(mnt, parent, mp);
-	else
-		mnt_set_mountpoint(parent, mp, mnt);
-	/*
-	 * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
-	 * beneath @parent then @mnt will need to be attached to
-	 * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
-	 * isn't the same mount as @parent.
-	 */
+	mnt_set_mountpoint(parent, mp, mnt);
 	__attach_mnt(mnt, mnt->mnt_parent);
 }
 
@@ -1157,7 +1142,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
 	hlist_del_init(&mnt->mnt_mp_list);
 	hlist_del_init_rcu(&mnt->mnt_hash);
 
-	attach_mnt(mnt, parent, mp, false);
+	attach_mnt(mnt, parent, mp);
 
 	put_mountpoint(old_mp);
 	mnt_add_count(old_parent, -1);
@@ -2295,7 +2280,7 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
 				goto out;
 			lock_mount_hash();
 			list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
-			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
+			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
 			unlock_mount_hash();
 		}
 	}
@@ -2743,10 +2728,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	}
 
 	if (moving) {
-		if (beneath)
-			dest_mp = smp;
 		unhash_mnt(source_mnt);
-		attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
+		if (beneath)
+			mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
+		else
+			mnt_set_mountpoint(top_mnt, dest_mp, source_mnt);
+		__attach_mnt(source_mnt, source_mnt->mnt_parent);
 		mnt_notify_add(source_mnt);
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
@@ -4827,9 +4814,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
 	}
 	/* mount old root on put_old */
-	attach_mnt(root_mnt, old_mnt, old_mp, false);
+	attach_mnt(root_mnt, old_mnt, old_mp);
 	/* mount new_root on / */
-	attach_mnt(new_mnt, root_parent, root_mp, false);
+	attach_mnt(new_mnt, root_parent, root_mp);
 	mnt_add_count(root_parent, -1);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	/* A moved mount should not expire automatically */

From 431cc1d8e2dab751b2b8f298a6d9caf83d8b49c3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 22:18:12 -0400
Subject: [PATCH 02/48] get rid of mnt_set_mountpoint_beneath()

mnt_set_mountpoint_beneath() consists of attaching new mount side-by-side
with the one we want to mount beneath (by mnt_set_mountpoint()), followed
by mnt_change_mountpoint() shifting the the top mount onto the new one
(by mnt_change_mountpoint()).

Both callers of mnt_set_mountpoint_beneath (both in attach_recursive_mnt())
have the same form - in 'beneath' case we call mnt_set_mountpoint_beneath(),
otherwise - mnt_set_mountpoint().

The thing is, expressing that as unconditional mnt_set_mountpoint(),
followed, in 'beneath' case, by mnt_change_mountpoint() is just as easy.
And these mnt_change_mountpoint() callers are similar to the ones we
do when it comes to attaching propagated copies, which will allow more
cleanups in the next commits.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 37 ++++---------------------------------
 1 file changed, 4 insertions(+), 33 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 1761d2c2fdae..888816289154 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1076,33 +1076,6 @@ void mnt_set_mountpoint(struct mount *mnt,
 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
 }
 
-/**
- * mnt_set_mountpoint_beneath - mount a mount beneath another one
- *
- * @new_parent: the source mount
- * @top_mnt:    the mount beneath which @new_parent is mounted
- * @new_mp:     the new mountpoint of @top_mnt on @new_parent
- *
- * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
- * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
- * @new_mp. And mount @new_parent on the old parent and old
- * mountpoint of @top_mnt.
- *
- * Context: This function expects namespace_lock() and lock_mount_hash()
- *          to have been acquired in that order.
- */
-static void mnt_set_mountpoint_beneath(struct mount *new_parent,
-				       struct mount *top_mnt,
-				       struct mountpoint *new_mp)
-{
-	struct mount *old_top_parent = top_mnt->mnt_parent;
-	struct mountpoint *old_top_mp = top_mnt->mnt_mp;
-
-	mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
-	mnt_change_mountpoint(new_parent, new_mp, top_mnt);
-}
-
-
 static void __attach_mnt(struct mount *mnt, struct mount *parent)
 {
 	hlist_add_head_rcu(&mnt->mnt_hash,
@@ -2729,10 +2702,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 
 	if (moving) {
 		unhash_mnt(source_mnt);
+		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		if (beneath)
-			mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
-		else
-			mnt_set_mountpoint(top_mnt, dest_mp, source_mnt);
+			mnt_change_mountpoint(source_mnt, smp, top_mnt);
 		__attach_mnt(source_mnt, source_mnt->mnt_parent);
 		mnt_notify_add(source_mnt);
 		touch_mnt_namespace(source_mnt->mnt_ns);
@@ -2745,10 +2717,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 				move_from_ns(p, &head);
 			list_del_init(&head);
 		}
+		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		if (beneath)
-			mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
-		else
-			mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
+			mnt_change_mountpoint(source_mnt, smp, top_mnt);
 		commit_tree(source_mnt);
 	}
 

From ffdc52fbbd5835a936ad683c943d6d103a2d4514 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 20 Jun 2025 22:46:55 -0400
Subject: [PATCH 03/48] prevent mount hash conflicts

Currently it's still possible to run into a pathological situation when
two hashed mounts share both parent and mountpoint.  That does not work
well, for obvious reasons.

We are not far from getting rid of that; the only remaining gap is
attach_recursive_mnt() not being careful enough when sliding a tree
under existing mount (for propagated copies or in 'beneath' case for
the original one).

To deal with that cleanly we need to be able to find overmounts
(i.e. mounts on top of parent's root); we could do hash lookups or scan
the list of children but either would be costly.  Since one of the results
we get from that will be prevention of multiple parallel overmounts, let's
just bite the bullet and store a (non-counting) reference to overmount
in struct mount.

With that done, closing the hole in attach_recursive_mnt() becomes easy
- we just need to follow the chain of overmounts before we change the
mountpoint of the mount we are sliding things under.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |  1 +
 fs/namespace.c | 27 ++++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index ad7173037924..b8beafdd6d24 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -92,6 +92,7 @@ struct mount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
 	struct hlist_head mnt_stuck_children;
+	struct mount *overmount;	/* mounted on ->mnt_root */
 } __randomize_layout;
 
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namespace.c b/fs/namespace.c
index 888816289154..9b732d74c2cc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1043,6 +1043,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 static struct mountpoint *unhash_mnt(struct mount *mnt)
 {
 	struct mountpoint *mp;
+	struct mount *parent = mnt->mnt_parent;
+	if (unlikely(parent->overmount == mnt))
+		parent->overmount = NULL;
 	mnt->mnt_parent = mnt;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	list_del_init(&mnt->mnt_child);
@@ -1078,6 +1081,8 @@ void mnt_set_mountpoint(struct mount *mnt,
 
 static void __attach_mnt(struct mount *mnt, struct mount *parent)
 {
+	if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
+		parent->overmount = mnt;
 	hlist_add_head_rcu(&mnt->mnt_hash,
 			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
@@ -2660,7 +2665,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = top_mnt->mnt_ns;
 	struct mountpoint *smp;
+	struct mountpoint *secondary = NULL;
 	struct mount *child, *dest_mnt, *p;
+	struct mount *top;
 	struct hlist_node *n;
 	int err = 0;
 	bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
@@ -2669,9 +2676,15 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	 * Preallocate a mountpoint in case the new mounts need to be
 	 * mounted beneath mounts on the same mountpoint.
 	 */
-	smp = get_mountpoint(source_mnt->mnt.mnt_root);
+	for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
+		if (!secondary && is_mnt_ns_file(top->mnt.mnt_root))
+			secondary = top->mnt_mp;
+	}
+	smp = get_mountpoint(top->mnt.mnt_root);
 	if (IS_ERR(smp))
 		return PTR_ERR(smp);
+	if (!secondary)
+		secondary = smp;
 
 	/* Is there space to add these mounts to the mount namespace? */
 	if (!moving) {
@@ -2704,7 +2717,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		unhash_mnt(source_mnt);
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		if (beneath)
-			mnt_change_mountpoint(source_mnt, smp, top_mnt);
+			mnt_change_mountpoint(top, smp, top_mnt);
 		__attach_mnt(source_mnt, source_mnt->mnt_parent);
 		mnt_notify_add(source_mnt);
 		touch_mnt_namespace(source_mnt->mnt_ns);
@@ -2719,7 +2732,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		}
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		if (beneath)
-			mnt_change_mountpoint(source_mnt, smp, top_mnt);
+			mnt_change_mountpoint(top, smp, top_mnt);
 		commit_tree(source_mnt);
 	}
 
@@ -2732,8 +2745,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		child->mnt.mnt_flags &= ~MNT_LOCKED;
 		q = __lookup_mnt(&child->mnt_parent->mnt,
 				 child->mnt_mountpoint);
-		if (q)
-			mnt_change_mountpoint(child, smp, q);
+		if (q) {
+			struct mount *r = child;
+			while (unlikely(r->overmount))
+				r = r->overmount;
+			mnt_change_mountpoint(r, secondary, q);
+		}
 		commit_tree(child);
 	}
 	put_mountpoint(smp);

From cf53a2d423c11ed70611e7b3f0878d6e419e348a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 1 Jun 2025 00:34:32 -0400
Subject: [PATCH 04/48] copy_tree(): don't set ->mnt_mountpoint on the root of
 copy

It never made any sense - neither when copy_tree() had been introduced
(2.4.11-pre5), nor at any point afterwards.  Mountpoint is meaningless
without parent mount and the root of copied tree has no parent until we get
around to attaching it somewhere.  At that time we'll have mountpoint set;
before that we have no idea which dentry will be used as mountpoint.
IOW, copy_tree() should just leave the default value.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 9b732d74c2cc..f0a56dbceff9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2222,7 +2222,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
 		return dst_mnt;
 
 	src_parent = src_root;
-	dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
 
 	list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
 		if (!is_subdir(src_root_child->mnt_mountpoint, dentry))

From 0e84653ea596bf9f5bfea58b0a34e0d9f72236c4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 28 Apr 2025 21:48:45 -0400
Subject: [PATCH 05/48] constify mnt_has_parent()

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/mount.h b/fs/mount.h
index b8beafdd6d24..c4d417cd7953 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -102,7 +102,7 @@ static inline struct mount *real_mount(struct vfsmount *mnt)
 	return container_of(mnt, struct mount, mnt);
 }
 
-static inline int mnt_has_parent(struct mount *mnt)
+static inline int mnt_has_parent(const struct mount *mnt)
 {
 	return mnt != mnt->mnt_parent;
 }

From 592238c03ef9e44ad6031b671da869ebcbffa8dc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 8 May 2025 17:28:00 -0400
Subject: [PATCH 06/48] pnode: lift peers() into pnode.h

it's going to be useful both in pnode.c and namespace.c

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 5 -----
 fs/pnode.h | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index ffd429b760d5..aa187144e389 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -216,11 +216,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 static struct mount *last_dest, *first_source, *last_source, *dest_master;
 static struct hlist_head *list;
 
-static inline bool peers(const struct mount *m1, const struct mount *m2)
-{
-	return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
-}
-
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 {
 	struct mount *child;
diff --git a/fs/pnode.h b/fs/pnode.h
index 2d026fb98b18..93fa9311bd07 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -34,6 +34,11 @@ static inline void set_mnt_shared(struct mount *mnt)
 	mnt->mnt.mnt_flags |= MNT_SHARED;
 }
 
+static inline bool peers(const struct mount *m1, const struct mount *m2)
+{
+	return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
+}
+
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);

From 9cb79ed60e38a0767b5b94c31bf1abf6518bc6b9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 8 Jun 2025 23:10:33 -0400
Subject: [PATCH 07/48] new predicate: mount_is_ancestor()

mount_is_ancestor(p1, p2) returns true iff there is a possibly
empty ancestry chain from p1 to p2.

Convert the open-coded checks.  Unlike those open-coded variants
it does not depend upon p1 not being root...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index f0a56dbceff9..aa93e1a48b5d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3483,6 +3483,17 @@ static inline bool path_overmounted(const struct path *path)
 	return unlikely(!no_child);
 }
 
+/*
+ * Check if there is a possibly empty chain of descent from p1 to p2.
+ * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl).
+ */
+static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
+{
+	while (p2 != p1 && mnt_has_parent(p2))
+		p2 = p2->mnt_parent;
+	return p2 == p1;
+}
+
 /**
  * can_move_mount_beneath - check that we can mount beneath the top mount
  * @from: mount to mount beneath
@@ -3534,9 +3545,8 @@ static int can_move_mount_beneath(const struct path *from,
 	if (parent_mnt_to == current->nsproxy->mnt_ns->root)
 		return -EINVAL;
 
-	for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
-		if (p == mnt_to)
-			return -EINVAL;
+	if (mount_is_ancestor(mnt_to, mnt_from))
+		return -EINVAL;
 
 	/*
 	 * If the parent mount propagates to the child mount this would
@@ -3705,9 +3715,8 @@ static int do_move_mount(struct path *old_path,
 	err = -ELOOP;
 	if (!check_for_nsfs_mounts(old))
 		goto out;
-	for (; mnt_has_parent(p); p = p->mnt_parent)
-		if (p == old)
-			goto out;
+	if (mount_is_ancestor(old, p))
+		goto out;
 
 	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
 	if (err)

From e031251cb249f824ad67cb0b2fc18b68d5792b8d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 9 Jun 2025 22:03:17 -0400
Subject: [PATCH 08/48] constify is_local_mountpoint()

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     | 4 ++--
 fs/namespace.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index c4d417cd7953..f10776003643 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -147,8 +147,8 @@ struct proc_mounts {
 
 extern const struct seq_operations mounts_op;
 
-extern bool __is_local_mountpoint(struct dentry *dentry);
-static inline bool is_local_mountpoint(struct dentry *dentry)
+extern bool __is_local_mountpoint(const struct dentry *dentry);
+static inline bool is_local_mountpoint(const struct dentry *dentry)
 {
 	if (!d_mountpoint(dentry))
 		return false;
diff --git a/fs/namespace.c b/fs/namespace.c
index aa93e1a48b5d..c4feb8315978 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -894,7 +894,7 @@ struct vfsmount *lookup_mnt(const struct path *path)
  * namespace not just a mount that happens to have some specified
  * parent mount.
  */
-bool __is_local_mountpoint(struct dentry *dentry)
+bool __is_local_mountpoint(const struct dentry *dentry)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct mount *mnt, *n;

From 05da054d43770e229dfb0e185c15452eed14364c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 8 Jun 2025 23:25:36 -0400
Subject: [PATCH 09/48] new predicate: anon_ns_root(mount)

checks if mount is the root of an anonymouns namespace.
Switch open-coded equivalents to using it.

For mounts that belong to anon namespace !mnt_has_parent(mount)
is the same as mount == ns->root, and intent is more obvious in
the latter form.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |  7 +++++++
 fs/namespace.c | 28 +++-------------------------
 2 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index f10776003643..f20e6ed845fe 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -161,6 +161,13 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
 	return ns->seq == 0;
 }
 
+static inline bool anon_ns_root(const struct mount *m)
+{
+	struct mnt_namespace *ns = READ_ONCE(m->mnt_ns);
+
+	return !IS_ERR_OR_NULL(ns) && is_anon_ns(ns) && m == ns->root;
+}
+
 static inline bool mnt_ns_attached(const struct mount *mnt)
 {
 	return !RB_EMPTY_NODE(&mnt->mnt_node);
diff --git a/fs/namespace.c b/fs/namespace.c
index c4feb8315978..ea01fea2ac93 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2480,9 +2480,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
 	 * loops get created.
 	 */
 	if (!check_mnt(old_mnt)) {
-		if (!is_mounted(&old_mnt->mnt) ||
-			!is_anon_ns(old_mnt->mnt_ns) ||
-			mnt_has_parent(old_mnt))
+		if (!anon_ns_root(old_mnt))
 			return ERR_PTR(-EINVAL);
 
 		if (!check_for_nsfs_mounts(old_mnt))
@@ -3649,9 +3647,6 @@ static int do_move_mount(struct path *old_path,
 	ns = old->mnt_ns;
 
 	err = -EINVAL;
-	/* The thing moved must be mounted... */
-	if (!is_mounted(&old->mnt))
-		goto out;
 
 	if (check_mnt(old)) {
 		/* if the source is in our namespace... */
@@ -3664,10 +3659,8 @@ static int do_move_mount(struct path *old_path,
 	} else {
 		/*
 		 * otherwise the source must be the root of some anon namespace.
-		 * AV: check for mount being root of an anon namespace is worth
-		 * an inlined predicate...
 		 */
-		if (!is_anon_ns(ns) || mnt_has_parent(old))
+		if (!anon_ns_root(old))
 			goto out;
 		/*
 		 * Bail out early if the target is within the same namespace -
@@ -5028,22 +5021,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
 	err = -EINVAL;
 	lock_mount_hash();
 
-	/* Ensure that this isn't anything purely vfs internal. */
-	if (!is_mounted(&mnt->mnt))
-		goto out;
-
-	/*
-	 * If this is an attached mount make sure it's located in the callers
-	 * mount namespace. If it's not don't let the caller interact with it.
-	 *
-	 * If this mount doesn't have a parent it's most often simply a
-	 * detached mount with an anonymous mount namespace. IOW, something
-	 * that's simply not attached yet. But there are apparently also users
-	 * that do change mount properties on the rootfs itself. That obviously
-	 * neither has a parent nor is it a detached mount so we cannot
-	 * unconditionally check for detached mounts.
-	 */
-	if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
+	if (!anon_ns_root(mnt) && !check_mnt(mnt))
 		goto out;
 
 	/*

From 9ed4b9eaeaa71cb0db4ec8460b5edd390aef58dd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 8 Jun 2025 23:41:23 -0400
Subject: [PATCH 10/48] dissolve_on_fput(): use anon_ns_root()

that's the condition we are actually trying to check there...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 64 +++++++++++---------------------------------------
 1 file changed, 14 insertions(+), 50 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index ea01fea2ac93..151d5f3360b9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2333,67 +2333,31 @@ void drop_collected_paths(struct path *paths, struct path *prealloc)
 static void free_mnt_ns(struct mnt_namespace *);
 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
 
-static inline bool must_dissolve(struct mnt_namespace *mnt_ns)
-{
-	/*
-        * This mount belonged to an anonymous mount namespace
-        * but was moved to a non-anonymous mount namespace and
-        * then unmounted.
-        */
-	if (unlikely(!mnt_ns))
-		return false;
-
-	/*
-        * This mount belongs to a non-anonymous mount namespace
-        * and we know that such a mount can never transition to
-        * an anonymous mount namespace again.
-        */
-	if (!is_anon_ns(mnt_ns)) {
-		/*
-		 * A detached mount either belongs to an anonymous mount
-		 * namespace or a non-anonymous mount namespace. It
-		 * should never belong to something purely internal.
-		 */
-		VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL);
-		return false;
-	}
-
-	return true;
-}
-
 void dissolve_on_fput(struct vfsmount *mnt)
 {
 	struct mnt_namespace *ns;
 	struct mount *m = real_mount(mnt);
 
+	/*
+	 * m used to be the root of anon namespace; if it still is one,
+	 * we need to dissolve the mount tree and free that namespace.
+	 * Let's try to avoid taking namespace_sem if we can determine
+	 * that there's nothing to do without it - rcu_read_lock() is
+	 * enough to make anon_ns_root() memory-safe and once m has
+	 * left its namespace, it's no longer our concern, since it will
+	 * never become a root of anon ns again.
+	 */
+
 	scoped_guard(rcu) {
-		if (!must_dissolve(READ_ONCE(m->mnt_ns)))
+		if (!anon_ns_root(m))
 			return;
 	}
 
 	scoped_guard(namespace_lock, &namespace_sem) {
+		if (!anon_ns_root(m))
+			return;
+
 		ns = m->mnt_ns;
-		if (!must_dissolve(ns))
-			return;
-
-		/*
-		 * After must_dissolve() we know that this is a detached
-		 * mount in an anonymous mount namespace.
-		 *
-		 * Now when mnt_has_parent() reports that this mount
-		 * tree has a parent, we know that this anonymous mount
-		 * tree has been moved to another anonymous mount
-		 * namespace.
-		 *
-		 * So when closing this file we cannot unmount the mount
-		 * tree. This will be done when the file referring to
-		 * the root of the anonymous mount namespace will be
-		 * closed (It could already be closed but it would sync
-		 * on @namespace_sem and wait for us to finish.).
-		 */
-		if (mnt_has_parent(m))
-			return;
-
 		lock_mount_hash();
 		umount_tree(m, UMOUNT_CONNECTED);
 		unlock_mount_hash();

From 1a867d729f951f1f0ef73c94d2f739f959cd8699 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 17 Jun 2025 21:10:02 -0400
Subject: [PATCH 11/48] __attach_mnt(): lose the second argument

It's always ->mnt_parent of the first one.  What the function does is
making a mount (with already set parent and mountpoint) visible - in
mount hash and in the parent's list of children.

IOW, it takes the existing rootwards linkage and sets the matching
crownwards linkage.

Renamed to make_visible(), while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 151d5f3360b9..75d45d0b615c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1079,8 +1079,9 @@ void mnt_set_mountpoint(struct mount *mnt,
 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
 }
 
-static void __attach_mnt(struct mount *mnt, struct mount *parent)
+static void make_visible(struct mount *mnt)
 {
+	struct mount *parent = mnt->mnt_parent;
 	if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
 		parent->overmount = mnt;
 	hlist_add_head_rcu(&mnt->mnt_hash,
@@ -1098,7 +1099,7 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)
  * Mount @mnt at @mp on @parent. Then attach @mnt
  * to @parent's child mount list and to @mount_hashtable.
  *
- * Note, when __attach_mnt() is called @mnt->mnt_parent already points
+ * Note, when make_visible() is called @mnt->mnt_parent already points
  * to the correct parent.
  *
  * Context: This function expects namespace_lock() and lock_mount_hash()
@@ -1108,7 +1109,7 @@ static void attach_mnt(struct mount *mnt, struct mount *parent,
 		       struct mountpoint *mp)
 {
 	mnt_set_mountpoint(parent, mp, mnt);
-	__attach_mnt(mnt, mnt->mnt_parent);
+	make_visible(mnt);
 }
 
 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
@@ -1182,7 +1183,7 @@ static void commit_tree(struct mount *mnt)
 	n->nr_mounts += n->pending_mounts;
 	n->pending_mounts = 0;
 
-	__attach_mnt(mnt, parent);
+	make_visible(mnt);
 	touch_mnt_namespace(n);
 }
 
@@ -2679,7 +2680,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		if (beneath)
 			mnt_change_mountpoint(top, smp, top_mnt);
-		__attach_mnt(source_mnt, source_mnt->mnt_parent);
+		make_visible(source_mnt);
 		mnt_notify_add(source_mnt);
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {

From d08fa7f44ae7f5ad5c842e15f2412790736a6144 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 6 May 2025 18:48:05 -0400
Subject: [PATCH 12/48] don't set MNT_LOCKED on parentless mounts

Originally MNT_LOCKED meant only one thing - "don't let this mount to
be peeled off its parent, we don't want to have its mountpoint exposed".
Accordingly, it had only been set on mounts that *do* have a parent.
Later it got overloaded with another use - setting it on the absolute
root had given free protection against umount(2) of absolute root
(was possible to trigger, oopsed).  Not a bad trick, but it ended
up costing more than it bought us.  Unfortunately, the cost included
both hard-to-reason-about logics and a subtle race between
mount -o remount,ro and mount --[r]bind - lockless &= ~MNT_LOCKED in
the end of __do_loopback() could race with sb_prepare_remount_readonly()
setting and clearing MNT_HOLD_WRITE (under mount_lock, as it should
be).  The race wouldn't be much of a problem (there are other ways to
deal with it), but the subtlety is.

Turns out that nobody except umount(2) had ever made use of having
MNT_LOCKED set on absolute root.  So let's give up on that trick,
clever as it had been, add an explicit check in do_umount() and
return to using MNT_LOCKED only for mounts that have a parent.

It means that
	* clone_mnt() no longer copies MNT_LOCKED
	* copy_tree() sets it on submounts if their counterparts had
been marked such, and does that right next to attach_mnt() in there,
in the same mount_lock scope.
	* __do_loopback() no longer needs to strip MNT_LOCKED off the
root of subtree it's about to return; no store, no race.
	* init_mount_tree() doesn't bother setting MNT_LOCKED on absolute
root.
	* lock_mnt_tree() does not set MNT_LOCKED on the subtree's root;
accordingly, its caller (loop in attach_recursive_mnt()) does not need to
bother stripping that MNT_LOCKED on root.  Note that lock_mnt_tree() setting
MNT_LOCKED on submounts happens in the same mount_lock scope as __attach_mnt()
(from commit_tree()) that makes them reachable.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 75d45d0b615c..791904128f1e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1313,7 +1313,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	}
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_LOCKED);
 
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1988,6 +1988,9 @@ static int do_umount(struct mount *mnt, int flags)
 	if (mnt->mnt.mnt_flags & MNT_LOCKED)
 		goto out;
 
+	if (!mnt_has_parent(mnt)) /* not the absolute root */
+		goto out;
+
 	event++;
 	if (flags & MNT_DETACH) {
 		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
@@ -2257,6 +2260,8 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
 			if (IS_ERR(dst_mnt))
 				goto out;
 			lock_mount_hash();
+			if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
+				dst_mnt->mnt.mnt_flags |= MNT_LOCKED;
 			list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
 			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
 			unlock_mount_hash();
@@ -2489,7 +2494,7 @@ static void lock_mnt_tree(struct mount *mnt)
 		if (flags & MNT_NOEXEC)
 			flags |= MNT_LOCK_NOEXEC;
 		/* Don't allow unprivileged users to reveal what is under a mount */
-		if (list_empty(&p->mnt_expire))
+		if (list_empty(&p->mnt_expire) && p != mnt)
 			flags |= MNT_LOCKED;
 		p->mnt.mnt_flags = flags;
 	}
@@ -2704,7 +2709,6 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		/* Notice when we are propagating across user namespaces */
 		if (child->mnt_parent->mnt_ns->user_ns != user_ns)
 			lock_mnt_tree(child);
-		child->mnt.mnt_flags &= ~MNT_LOCKED;
 		q = __lookup_mnt(&child->mnt_parent->mnt,
 				 child->mnt_mountpoint);
 		if (q) {
@@ -2985,26 +2989,21 @@ static inline bool may_copy_tree(struct path *path)
 
 static struct mount *__do_loopback(struct path *old_path, int recurse)
 {
-	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+	struct mount *old = real_mount(old_path->mnt);
 
 	if (IS_MNT_UNBINDABLE(old))
-		return mnt;
+		return ERR_PTR(-EINVAL);
 
 	if (!may_copy_tree(old_path))
-		return mnt;
+		return ERR_PTR(-EINVAL);
 
 	if (!recurse && __has_locked_children(old, old_path->dentry))
-		return mnt;
+		return ERR_PTR(-EINVAL);
 
 	if (recurse)
-		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+		return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
 	else
-		mnt = clone_mnt(old, old_path->dentry, 0);
-
-	if (!IS_ERR(mnt))
-		mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
-	return mnt;
+		return clone_mnt(old, old_path->dentry, 0);
 }
 
 /*
@@ -4749,11 +4748,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!path_mounted(&root))
 		goto out4; /* not a mountpoint */
 	if (!mnt_has_parent(root_mnt))
-		goto out4; /* not attached */
+		goto out4; /* absolute root */
 	if (!path_mounted(&new))
 		goto out4; /* not a mountpoint */
 	if (!mnt_has_parent(new_mnt))
-		goto out4; /* not attached */
+		goto out4; /* absolute root */
 	/* make sure we can reach put_old from new_root */
 	if (!is_path_reachable(old_mnt, old.dentry, &new))
 		goto out4;
@@ -6154,7 +6153,6 @@ static void __init init_mount_tree(void)
 
 	root.mnt = mnt;
 	root.dentry = mnt->mnt_root;
-	mnt->mnt_flags |= MNT_LOCKED;
 
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);

From 49acacdc7cd3c1dcf5190a80ea9e6ca2ffa06cec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 May 2025 14:05:50 -0400
Subject: [PATCH 13/48] clone_mnt(): simplify the propagation-related logics

The underlying rules are simple:
	* MNT_SHARED should be set iff ->mnt_group_id of new mount ends up
non-zero.
	* mounts should be on the same ->mnt_share cyclic list iff they have
the same non-zero ->mnt_group_id value.
	* CL_PRIVATE is mutually exclusive with MNT_SHARED, MNT_SLAVE,
MNT_SHARED_TO_SLAVE and MNT_EXPIRE; the whole point of that thing is to
get a clone of old mount that would *not* be on any namespace-related
lists.

The above allows to make the logics more straightforward; what's more,
it makes the proof that invariants are maintained much simpler.
The variant in mainline is safe (aside of a very narrow race with
unsafe modification of mnt_flags right after we had the mount exposed
in superblock's ->s_mounts; theoretically it can race with ro remount
of the original, but it's not easy to hit), but proof of its correctness
is really unpleasant.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 791904128f1e..12cf69da4320 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1301,6 +1301,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	if (!mnt)
 		return ERR_PTR(-ENOMEM);
 
+	mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
+			     ~MNT_INTERNAL_FLAGS;
+
 	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
 		mnt->mnt_group_id = 0; /* not a peer of original */
 	else
@@ -1312,8 +1315,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 			goto out_free;
 	}
 
-	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_LOCKED);
+	if (mnt->mnt_group_id)
+		set_mnt_shared(mnt);
 
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1326,22 +1329,20 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
 	unlock_mount_hash();
 
+	if (flag & CL_PRIVATE)	// we are done with it
+		return mnt;
+
+	if (peers(mnt, old))
+		list_add(&mnt->mnt_share, &old->mnt_share);
+
 	if ((flag & CL_SLAVE) ||
 	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
 		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
 		mnt->mnt_master = old;
-		CLEAR_MNT_SHARED(mnt);
-	} else if (!(flag & CL_PRIVATE)) {
-		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
-			list_add(&mnt->mnt_share, &old->mnt_share);
-		if (IS_MNT_SLAVE(old))
-			list_add(&mnt->mnt_slave, &old->mnt_slave);
+	} else if (IS_MNT_SLAVE(old)) {
+		list_add(&mnt->mnt_slave, &old->mnt_slave);
 		mnt->mnt_master = old->mnt_master;
-	} else {
-		CLEAR_MNT_SHARED(mnt);
 	}
-	if (flag & CL_MAKE_SHARED)
-		set_mnt_shared(mnt);
 
 	/* stick the duplicate mount on the same expiry list
 	 * as the original if that was on one */
@@ -1349,7 +1350,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 		if (!list_empty(&old->mnt_expire))
 			list_add(&mnt->mnt_expire, &old->mnt_expire);
 	}
-
 	return mnt;
 
  out_free:

From c93ff74ff1cbc0ab152cebed3e44223fee3c70a2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Apr 2025 19:17:28 -0400
Subject: [PATCH 14/48] do_umount(): simplify the "is it still mounted" checks

Calls of do_umount() are always preceded by can_umount(), where we'd
done a racy check for mount belonging to our namespace; if it wasn't,
can_unmount() would've failed with -EINVAL and we wouldn't have
reached do_umount() at all.

That check needs to be redone once we have acquired namespace_sem
and in do_umount() we do that.  However, that's done in a very odd
way; we check that mount is still in rbtree of _some_ namespace or
its mnt_list is not empty.  It is equivalent to check_mnt(mnt) -
we know that earlier mnt was mounted in our namespace; if it has
stayed there, it's going to remain in rbtree of our namespace.
OTOH, if it ever had been removed from out namespace, it would be
removed from rbtree and it never would've re-added to a namespace
afterwards.  As for ->mnt_list, for something that had been mounted
in a namespace we'll never observe non-empty ->mnt_list while holding
namespace_sem - it does temporarily become non-empty during
umount_tree(), but that doesn't outlast the call of umount_tree(),
let alone dropping namespace_sem.

Things get much easier to follow if we replace that with (equivalent)
check_mnt(mnt) there.  What's more, currently we treat a failure of
that test as "quietly do nothing"; we might as well pretend that we'd
lost the race and fail on that the same way can_umount() would have.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 12cf69da4320..57b0974a5d1e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1983,8 +1983,11 @@ static int do_umount(struct mount *mnt, int flags)
 	namespace_lock();
 	lock_mount_hash();
 
-	/* Recheck MNT_LOCKED with the locks held */
+	/* Repeat the earlier racy checks, now that we are holding the locks */
 	retval = -EINVAL;
+	if (!check_mnt(mnt))
+		goto out;
+
 	if (mnt->mnt.mnt_flags & MNT_LOCKED)
 		goto out;
 
@@ -1993,16 +1996,14 @@ static int do_umount(struct mount *mnt, int flags)
 
 	event++;
 	if (flags & MNT_DETACH) {
-		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
-			umount_tree(mnt, UMOUNT_PROPAGATE);
+		umount_tree(mnt, UMOUNT_PROPAGATE);
 		retval = 0;
 	} else {
 		smp_mb(); // paired with __legitimize_mnt()
 		shrink_submounts(mnt);
 		retval = -EBUSY;
 		if (!propagate_mount_busy(mnt, 2)) {
-			if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
-				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
+			umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 			retval = 0;
 		}
 	}

From 24368a744bafce7daf1eafd6a163871925ee5892 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 2 May 2025 21:32:01 -0400
Subject: [PATCH 15/48] sanitize handling of long-term internal mounts

Original rationale for those had been the reduced cost of mntput()
for the stuff that is mounted somewhere.  Mount refcount increments and
decrements are frequent; what's worse, they tend to concentrate on the
same instances and cacheline pingpong is quite noticable.

As the result, mount refcounts are per-cpu; that allows a very cheap
increment.  Plain decrement would be just as easy, but decrement-and-test
is anything but (we need to add the components up, with exclusion against
possible increment-from-zero, etc.).

Fortunately, there is a very common case where we can tell that decrement
won't be the final one - if the thing we are dropping is currently
mounted somewhere.  We have an RCU delay between the removal from mount
tree and dropping the reference that used to pin it there, so we can
just take rcu_read_lock() and check if the victim is mounted somewhere.
If it is, we can go ahead and decrement without and further checks -
the reference we are dropping is not the last one.  If it isn't, we
get all the fun with locking, carefully adding up components, etc.,
but the majority of refcount decrements end up taking the fast path.

There is a major exception, though - pipes and sockets.  Those live
on the internal filesystems that are not going to be mounted anywhere.
They are not going to be _un_mounted, of course, so having to take the
slow path every time a pipe or socket gets closed is really obnoxious.
Solution had been to mark them as long-lived ones - essentially faking
"they are mounted somewhere" indicator.

With minor modification that works even for ones that do eventually get
dropped - all it takes is making sure we have an RCU delay between
clearing the "mounted somewhere" indicator and dropping the reference.

There are some additional twists (if you want to drop a dozen of such
internal mounts, you'd be better off with clearing the indicator on
all of them, doing an RCU delay once, then dropping the references),
but in the basic form it had been
	* use kern_mount() if you want your internal mount to be
a long-term one.
	* use kern_unmount() to undo that.

Unfortunately, the things did rot a bit during the mount API reshuffling.
In several cases we have lost the "fake the indicator" part; kern_unmount()
on the unmount side remained (it doesn't warn if you use it on a mount
without the indicator), but all benefits regaring mntput() cost had been
lost.

To get rid of that bitrot, let's add a new helper that would work
with fs_context-based API: fc_mount_longterm().  It's a counterpart
of fc_mount() that does, on success, mark its result as long-term.
It must be paired with kern_unmount() or equivalents.

Converted:
	1) mqueue (it used to use kern_mount_data() and the umount side
is still as it used to be)
	2) hugetlbfs (used to use kern_mount_data(), internal mount is
never unmounted in this one)
	3) i915 gemfs (used to be kern_mount() + manual remount to set
options, still uses kern_unmount() on umount side)
	4) v3d gemfs (copied from i915)

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/gpu/drm/i915/gem/i915_gemfs.c | 21 ++++++++++++++++++---
 drivers/gpu/drm/v3d/v3d_gemfs.c       | 21 ++++++++++++++++++---
 fs/hugetlbfs/inode.c                  |  2 +-
 fs/namespace.c                        |  9 +++++++++
 include/linux/mount.h                 |  1 +
 ipc/mqueue.c                          |  2 +-
 6 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gemfs.c b/drivers/gpu/drm/i915/gem/i915_gemfs.c
index 65d84a93c525..a09e2eb47175 100644
--- a/drivers/gpu/drm/i915/gem/i915_gemfs.c
+++ b/drivers/gpu/drm/i915/gem/i915_gemfs.c
@@ -5,16 +5,23 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/fs_context.h>
 
 #include "i915_drv.h"
 #include "i915_gemfs.h"
 #include "i915_utils.h"
 
+static int add_param(struct fs_context *fc, const char *key, const char *val)
+{
+	return vfs_parse_fs_string(fc, key, val, strlen(val));
+}
+
 void i915_gemfs_init(struct drm_i915_private *i915)
 {
-	char huge_opt[] = "huge=within_size"; /* r/w */
 	struct file_system_type *type;
+	struct fs_context *fc;
 	struct vfsmount *gemfs;
+	int ret;
 
 	/*
 	 * By creating our own shmemfs mountpoint, we can pass in
@@ -38,8 +45,16 @@ void i915_gemfs_init(struct drm_i915_private *i915)
 	if (!type)
 		goto err;
 
-	gemfs = vfs_kern_mount(type, SB_KERNMOUNT, type->name, huge_opt);
-	if (IS_ERR(gemfs))
+	fc = fs_context_for_mount(type, SB_KERNMOUNT);
+	if (IS_ERR(fc))
+		goto err;
+	ret = add_param(fc, "source", "tmpfs");
+	if (!ret)
+		ret = add_param(fc, "huge", "within_size");
+	if (!ret)
+		gemfs = fc_mount_longterm(fc);
+	put_fs_context(fc);
+	if (ret)
 		goto err;
 
 	i915->mm.gemfs = gemfs;
diff --git a/drivers/gpu/drm/v3d/v3d_gemfs.c b/drivers/gpu/drm/v3d/v3d_gemfs.c
index 4c5e18590a5c..8ec6ed82b3d9 100644
--- a/drivers/gpu/drm/v3d/v3d_gemfs.c
+++ b/drivers/gpu/drm/v3d/v3d_gemfs.c
@@ -3,14 +3,21 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/fs_context.h>
 
 #include "v3d_drv.h"
 
+static int add_param(struct fs_context *fc, const char *key, const char *val)
+{
+	return vfs_parse_fs_string(fc, key, val, strlen(val));
+}
+
 void v3d_gemfs_init(struct v3d_dev *v3d)
 {
-	char huge_opt[] = "huge=within_size";
 	struct file_system_type *type;
+	struct fs_context *fc;
 	struct vfsmount *gemfs;
+	int ret;
 
 	/*
 	 * By creating our own shmemfs mountpoint, we can pass in
@@ -28,8 +35,16 @@ void v3d_gemfs_init(struct v3d_dev *v3d)
 	if (!type)
 		goto err;
 
-	gemfs = vfs_kern_mount(type, SB_KERNMOUNT, type->name, huge_opt);
-	if (IS_ERR(gemfs))
+	fc = fs_context_for_mount(type, SB_KERNMOUNT);
+	if (IS_ERR(fc))
+		goto err;
+	ret = add_param(fc, "source", "tmpfs");
+	if (!ret)
+		ret = add_param(fc, "huge", "within_size");
+	if (!ret)
+		gemfs = fc_mount_longterm(fc);
+	put_fs_context(fc);
+	if (ret)
 		goto err;
 
 	v3d->gemfs = gemfs;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e4de5425838d..4e0397775167 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1587,7 +1587,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
 	} else {
 		struct hugetlbfs_fs_context *ctx = fc->fs_private;
 		ctx->hstate = h;
-		mnt = fc_mount(fc);
+		mnt = fc_mount_longterm(fc);
 		put_fs_context(fc);
 	}
 	if (IS_ERR(mnt))
diff --git a/fs/namespace.c b/fs/namespace.c
index 57b0974a5d1e..6a0697eeda74 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1260,6 +1260,15 @@ struct vfsmount *fc_mount(struct fs_context *fc)
 }
 EXPORT_SYMBOL(fc_mount);
 
+struct vfsmount *fc_mount_longterm(struct fs_context *fc)
+{
+	struct vfsmount *mnt = fc_mount(fc);
+	if (!IS_ERR(mnt))
+		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
+	return mnt;
+}
+EXPORT_SYMBOL(fc_mount_longterm);
+
 struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				int flags, const char *name,
 				void *data)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 1a508beba446..c145820fcbbf 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -98,6 +98,7 @@ int mnt_get_write_access(struct vfsmount *mnt);
 void mnt_put_write_access(struct vfsmount *mnt);
 
 extern struct vfsmount *fc_mount(struct fs_context *fc);
+extern struct vfsmount *fc_mount_longterm(struct fs_context *fc);
 extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				      int flags, const char *name,
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 82ed2d3c9846..de7432efbf4a 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -482,7 +482,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
 	put_user_ns(fc->user_ns);
 	fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
 
-	mnt = fc_mount(fc);
+	mnt = fc_mount_longterm(fc);
 	put_fs_context(fc);
 	return mnt;
 }

From f0d0ba19985d23a3e83d654318ccb6e9c5f1b095 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 14 May 2025 20:50:06 -0400
Subject: [PATCH 16/48] Rewrite of propagate_umount()

The variant currently in the tree has problems; trying to prove
correctness has caught at least one class of bugs (reparenting
that ends up moving the visible location of reparented mount, due
to not excluding some of the counterparts on propagation that
should've been included).

I tried to prove that it's the only bug there; I'm still not sure
whether it is.  If anyone can reconstruct and write down an analysis
of the mainline implementation, I'll gladly review it; as it is,
I ended up doing a different implementation.  Candidate collection
phase is similar, but trimming the set down until it satisfies the
constraints turned out pretty different.

I hoped to do transformation as a massage series, but that turns out
to be too convoluted.  So it's a single patch replacing propagate_umount()
and friends in one go, with notes and analysis in D/f/propagate_umount.txt
(in addition to inline comments).

As far I can tell, it is provably correct and provably linear by the number
of mounts we need to look at in order to decide what should be unmounted.
It even builds and seems to survive testing...

Another nice thing that fell out of that is that ->mnt_umounting is no longer
needed.

Compared to the first version:
	* explicit MNT_UMOUNT_CANDIDATE flag for is_candidate()
	* trim_ancestors() only clears that flag, leaving the suckers on list
	* trim_one() and handle_locked() take the stuff with flag cleared off
the list.  That allows to iterate with list_for_each_entry_safe() when calling
trim_one() - it removes at most one element from the list now.
	* no globals - I didn't bother with any kind of context, not worth it.

	* Notes updated accordingly; I have not touch the terms yet.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 .../filesystems/propagate_umount.txt          | 484 ++++++++++++++++++
 fs/mount.h                                    |   1 -
 fs/namespace.c                                |   1 -
 fs/pnode.c                                    | 388 +++++++-------
 fs/pnode.h                                    |   2 +-
 include/linux/mount.h                         |   3 +-
 6 files changed, 698 insertions(+), 181 deletions(-)
 create mode 100644 Documentation/filesystems/propagate_umount.txt

diff --git a/Documentation/filesystems/propagate_umount.txt b/Documentation/filesystems/propagate_umount.txt
new file mode 100644
index 000000000000..6906903a8aa2
--- /dev/null
+++ b/Documentation/filesystems/propagate_umount.txt
@@ -0,0 +1,484 @@
+	Notes on propagate_umount()
+
+Umount propagation starts with a set of mounts we are already going to
+take out.  Ideally, we would like to add all downstream cognates to
+that set - anything with the same mountpoint as one of the removed
+mounts and with parent that would receive events from the parent of that
+mount.  However, there are some constraints the resulting set must
+satisfy.
+
+It is convenient to define several properties of sets of mounts:
+
+1) A set S of mounts is non-shifting if for any mount X belonging
+to S all subtrees mounted strictly inside of X (i.e. not overmounting
+the root of X) contain only elements of S.
+
+2) A set S is non-revealing if all locked mounts that belong to S have
+parents that also belong to S.
+
+3) A set S is closed if it contains all children of its elements.
+
+The set of mounts taken out by umount(2) must be non-shifting and
+non-revealing; the first constraint is what allows to reparent
+any remaining mounts and the second is what prevents the exposure
+of any concealed mountpoints.
+
+propagate_umount() takes the original set as an argument and tries to
+extend that set.  The original set is a full subtree and its root is
+unlocked; what matters is that it's closed and non-revealing.
+Resulting set may not be closed; there might still be mounts outside
+of that set, but only on top of stacks of root-overmounting elements
+of set.  They can be reparented to the place where the bottom of
+stack is attached to a mount that will survive.  NOTE: doing that
+will violate a constraint on having no more than one mount with
+the same parent/mountpoint pair; however, the caller (umount_tree())
+will immediately remedy that - it may keep unmounted element attached
+to parent, but only if the parent itself is unmounted.  Since all
+conflicts created by reparenting have common parent *not* in the
+set and one side of the conflict (bottom of the stack of overmounts)
+is in the set, it will be resolved.  However, we rely upon umount_tree()
+doing that pretty much immediately after the call of propagate_umount().
+
+Algorithm is based on two statements:
+	1) for any set S, there is a maximal non-shifting subset of S
+and it can be calculated in O(#S) time.
+	2) for any non-shifting set S, there is a maximal non-revealing
+subset of S.  That subset is also non-shifting and it can be calculated
+in O(#S) time.
+
+		Finding candidates.
+
+We are given a closed set U and we want to find all mounts that have
+the same mountpoint as some mount m in U *and* whose parent receives
+propagation from the parent of the same mount m.  Naive implementation
+would be
+	S = {}
+	for each m in U
+		add m to S
+		p = parent(m)
+		for each q in Propagation(p) - {p}
+			child = look_up(q, mountpoint(m))
+			if child
+				add child to S
+but that can lead to excessive work - there might be propagation among the
+subtrees of U, in which case we'd end up examining the same candidates
+many times.  Since propagation is transitive, the same will happen to
+everything downstream of that candidate and it's not hard to construct
+cases where the approach above leads to the time quadratic by the actual
+number of candidates.
+
+Note that if we run into a candidate we'd already seen, it must've been
+added on an earlier iteration of the outer loop - all additions made
+during one iteration of the outer loop have different parents.  So
+if we find a child already added to the set, we know that everything
+in Propagation(parent(child)) with the same mountpoint has been already
+added.
+	S = {}
+	for each m in U
+		if m in S
+			continue
+		add m to S
+		p = parent(m)
+		q = propagation_next(p, p)
+		while q
+			child = look_up(q, mountpoint(m))
+			if child
+				if child in S
+					q = skip_them(q, p)
+					continue;
+				add child to S
+			q = propagation_next(q, p)
+where
+skip_them(q, p)
+	keep walking Propagation(p) from q until we find something
+	not in Propagation(q)
+
+would get rid of that problem, but we need a sane implementation of
+skip_them().  That's not hard to do - split propagation_next() into
+"down into mnt_slave_list" and "forward-and-up" parts, with the
+skip_them() being "repeat the forward-and-up part until we get NULL
+or something that isn't a peer of the one we are skipping".
+
+Note that there can be no absolute roots among the extra candidates -
+they all come from mount lookups.  Absolute root among the original
+set is _currently_ impossible, but it might be worth protecting
+against.
+
+		Maximal non-shifting subsets.
+
+Let's call a mount m in a set S forbidden in that set if there is a
+subtree mounted strictly inside m and containing mounts that do not
+belong to S.
+
+The set is non-shifting when none of its elements are forbidden in it.
+
+If mount m is forbidden in a set S, it is forbidden in any subset S' it
+belongs to.  In other words, it can't belong to any of the non-shifting
+subsets of S.  If we had a way to find a forbidden mount or show that
+there's none, we could use it to find the maximal non-shifting subset
+simply by finding and removing them until none remain.
+
+Suppose mount m is forbidden in S; then any mounts forbidden in S - {m}
+must have been forbidden in S itself.  Indeed, since m has descendents
+that do not belong to S, any subtree that fits into S will fit into
+S - {m} as well.
+
+So in principle we could go through elements of S, checking if they
+are forbidden in S and removing the ones that are.  Removals will
+not invalidate the checks done for earlier mounts - if they were not
+forbidden at the time we checked, they won't become forbidden later.
+It's too costly to be practical, but there is a similar approach that
+is linear by size of S.
+
+Let's say that mount x in a set S is forbidden by mount y, if
+	* both x and y belong to S.
+	* there is a chain of mounts starting at x and leaving S
+	  immediately after passing through y, with the first
+	  mountpoint strictly inside x.
+Note 1: x may be equal to y - that's the case when something not
+belonging to S is mounted strictly inside x.
+Note 2: if y does not belong to S, it can't forbid anything in S.
+Note 3: if y has no children outside of S, it can't forbid anything in S.
+
+It's easy to show that mount x is forbidden in S if and only if x is
+forbidden in S by some mount y.  And it's easy to find all mounts in S
+forbidden by a given mount.
+
+Consider the following operation:
+	Trim(S, m) = S - {x : x is forbidden by m in S}
+
+Note that if m does not belong to S or has no children outside of S we
+are guaranteed that Trim(S, m) is equal to S.
+
+The following is true: if x is forbidden by y in Trim(S, m), it was
+already forbidden by y in S.
+
+Proof: Suppose x is forbidden by y in Trim(S, m).  Then there is a
+chain of mounts (x_0 = x, ..., x_k = y, x_{k+1} = r), such that x_{k+1}
+is the first element that doesn't belong to Trim(S, m) and the
+mountpoint of x_1 is strictly inside x.  If mount r belongs to S, it must
+have been removed by Trim(S, m), i.e. it was forbidden in S by m.
+Then there was a mount chain from r to some child of m that stayed in
+S all the way until m, but that's impossible since x belongs to Trim(S, m)
+and prepending (x_0, ..., x_k) to that chain demonstrates that x is also
+forbidden in S by m, and thus can't belong to Trim(S, m).
+Therefore r can not belong to S and our chain demonstrates that
+x is forbidden by y in S.  QED.
+
+Corollary: no mount is forbidden by m in Trim(S, m).  Indeed, any
+such mount would have been forbidden by m in S and thus would have been
+in the part of S removed in Trim(S, m).
+
+Corollary: no mount is forbidden by m in Trim(Trim(S, m), n).  Indeed,
+any such would have to have been forbidden by m in Trim(S, m), which
+is impossible.
+
+Corollary: after
+	S = Trim(S, x_1)
+	S = Trim(S, x_2)
+	...
+	S = Trim(S, x_k)
+no mount remaining in S will be forbidden by either of x_1,...,x_k.
+
+The following will reduce S to its maximal non-shifting subset:
+	visited = {}
+	while S contains elements not belonging to visited
+		let m be an arbitrary such element of S
+		S = Trim(S, m)
+		add m to visited
+
+S never grows, so the number of elements of S not belonging to visited
+decreases at least by one on each iteration.  When the loop terminates,
+all mounts remaining in S belong to visited.  It's easy to see that at
+the beginning of each iteration no mount remaining in S will be forbidden
+by any element of visited.  In other words, no mount remaining in S will
+be forbidden, i.e. final value of S will be non-shifting.  It will be
+the maximal non-shifting subset, since we were removing only forbidden
+elements.
+
+	There are two difficulties in implementing the above in linear
+time, both due to the fact that Trim() might need to remove more than one
+element.  Naive implementation of Trim() is vulnerable to running into a
+long chain of mounts, each mounted on top of parent's root.  Nothing in
+that chain is forbidden, so nothing gets removed from it.  We need to
+recognize such chains and avoid walking them again on subsequent calls of
+Trim(), otherwise we will end up with worst-case time being quadratic by
+the number of elements in S.  Another difficulty is in implementing the
+outer loop - we need to iterate through all elements of a shrinking set.
+That would be trivial if we never removed more than one element at a time
+(linked list, with list_for_each_entry_safe for iterator), but we may
+need to remove more than one entry, possibly including the ones we have
+already visited.
+
+	Let's start with naive algorithm for Trim():
+
+Trim_one(m)
+	found = false
+	for each n in children(m)
+		if n not in S
+			found = true
+			if (mountpoint(n) != root(m))
+				remove m from S
+				break
+	if found
+		Trim_ancestors(m)
+
+Trim_ancestors(m)
+	for (; parent(m) in S; m = parent(m)) {
+		if (mountpoint(m) != root(parent(m)))
+			remove parent(m) from S
+	}
+
+If m belongs to S, Trim_one(m) will replace S with Trim(S, m).
+Proof:
+	Consider the chains excluding elements from Trim(S, m).  The last
+two elements in such chain are m and some child of m that does not belong
+to S.  If m has no such children, Trim(S, m) is equal to S.
+	m itself is removed if and only if the chain has exactly two
+elements, i.e. when the last element does not overmount the root of m.
+In other words, that happens when m has a child not in S that does not
+overmount the root of m.
+	All other elements to remove will be ancestors of m, such that
+the entire descent chain from them to m is contained in S.  Let
+(x_0, x_1, ..., x_k = m) be the longest such chain.  x_i needs to be
+removed if and only if x_{i+1} does not overmount its root.  It's easy
+to see that Trim_ancestors(m) will iterate through that chain from
+x_k to x_1 and that it will remove exactly the elements that need to be
+removed.
+
+	Note that if the loop in Trim_ancestors() walks into an already
+visited element, we are guaranteed that remaining iterations will see
+only elements that had already been visited and remove none of them.
+That's the weakness that makes it vulnerable to long chains of full
+overmounts.
+
+	It's easy to deal with, if we can afford setting marks on
+elements of S; we would mark all elements already visited by
+Trim_ancestors() and have it bail out as soon as it sees an already
+marked element.
+
+	The problems with iterating through the set can be dealt with in
+several ways, depending upon the representation we choose for our set.
+One useful observation is that we are given a closed subset in S - the
+original set passed to propagate_umount().  Its elements can neither
+forbid anything nor be forbidden by anything - all their descendents
+belong to S, so they can not occur anywhere in any excluding chain.
+In other words, the elements of that subset will remain in S until
+the end and Trim_one(S, m) is a no-op for all m from that subset.
+
+	That suggests keeping S as a disjoint union of a closed set U
+('will be unmounted, no matter what') and the set of all elements of
+S that do not belong to U.  That set ('candidates') is all we need
+to iterate through.  Let's represent it as a subset in a cyclic list,
+consisting of all list elements that are marked as candidates (initially -
+all of them).  Then we could have Trim_ancestors() only remove the mark,
+leaving the elements on the list.  Then Trim_one() would never remove
+anything other than its argument from the containing list, allowing to
+use list_for_each_entry_safe() as iterator.
+
+	Assuming that representation we get the following:
+
+	list_for_each_entry_safe(m, ..., Candidates, ...)
+		Trim_one(m)
+where
+Trim_one(m)
+	if (m is not marked as a candidate)
+		strip the "seen by Trim_ancestors" mark from m
+		remove m from the Candidates list
+		return
+		
+	remove_this = false
+	found = false
+	for each n in children(m)
+		if n not in S
+			found = true
+			if (mountpoint(n) != root(m))
+				remove_this = true
+				break
+	if found
+		Trim_ancestors(m)
+	if remove_this
+		strip the "seen by Trim_ancestors" mark from m
+		strip the "candidate" mark from m
+		remove m from the Candidate list
+
+Trim_ancestors(m)
+	for (p = parent(m); p is marked as candidate ; m = p, p = parent(p)) {
+		if m is marked as seen by Trim_ancestors
+			return
+		mark m as seen by Trim_ancestors
+		if (mountpoint(m) != root(p))
+			strip the "candidate" mark from p
+	}
+
+	Terminating condition in the loop in Trim_ancestors() is correct,
+since that that loop will never run into p belonging to U - p is always
+an ancestor of argument of Trim_one() and since U is closed, the argument
+of Trim_one() would also have to belong to U.  But Trim_one() is never
+called for elements of U.  In other words, p belongs to S if and only
+if it belongs to candidates.
+
+	Time complexity:
+* we get no more than O(#S) calls of Trim_one()
+* the loop over children in Trim_one() never looks at the same child
+twice through all the calls.
+* iterations of that loop for children in S are no more than O(#S)
+in the worst case
+* at most two children that are not elements of S are considered per
+call of Trim_one().
+* the loop in Trim_ancestors() sets its mark once per iteration and
+no element of S has is set more than once.
+
+	In the end we may have some elements excluded from S by
+Trim_ancestors() still stuck on the list.  We could do a separate
+loop removing them from the list (also no worse than O(#S) time),
+but it's easier to leave that until the next phase - there we will
+iterate through the candidates anyway.
+
+	The caller has already removed all elements of U from their parents'
+lists of children, which means that checking if child belongs to S is
+equivalent to checking if it's marked as a candidate; we'll never see
+the elements of U in the loop over children in Trim_one().
+
+	What's more, if we see that children(m) is empty and m is not
+locked, we can immediately move m into the committed subset (remove
+from the parent's list of children, etc.).  That's one fewer mount we'll
+have to look into when we check the list of children of its parent *and*
+when we get to building the non-revealing subset.
+
+		Maximal non-revealing subsets
+
+If S is not a non-revealing subset, there is a locked element x in S
+such that parent of x is not in S.
+
+Obviously, no non-revealing subset of S may contain x.  Removing such
+elements one by one will obviously end with the maximal non-revealing
+subset (possibly empty one).  Note that removal of an element will
+require removal of all its locked children, etc.
+
+If the set had been non-shifting, it will remain non-shifting after
+such removals.
+Proof: suppose S was non-shifting, x is a locked element of S, parent of x
+is not in S and S - {x} is not non-shifting.  Then there is an element m
+in S - {x} and a subtree mounted strictly inside m, such that m contains
+an element not in in S - {x}.  Since S is non-shifting, everything in
+that subtree must belong to S.  But that means that this subtree must
+contain x somewhere *and* that parent of x either belongs that subtree
+or is equal to m.  Either way it must belong to S.  Contradiction.
+
+// same representation as for finding maximal non-shifting subsets:
+// S is a disjoint union of a non-revealing set U (the ones we are committed
+// to unmount) and a set of candidates, represented as a subset of list
+// elements that have "is a candidate" mark on them.
+// Elements of U are removed from their parents' lists of children.
+// In the end candidates becomes empty and maximal non-revealing non-shifting
+// subset of S is now in U
+	while (Candidates list is non-empty)
+		handle_locked(first(Candidates))
+
+handle_locked(m)
+	if m is not marked as a candidate
+		strip the "seen by Trim_ancestors" mark from m
+		remove m from the list
+		return
+	cutoff = m
+	for (p = m; p in candidates; p = parent(p)) {
+		strip the "seen by Trim_ancestors" mark from p
+		strip the "candidate" mark from p
+		remove p from the Candidates list
+		if (!locked(p))
+			cutoff = parent(p)
+	}
+	if p in U
+		cutoff = p
+	while m != cutoff
+		remove m from children(parent(m))
+		add m to U
+		m = parent(m)
+
+Let (x_0, ..., x_n = m) be the maximal chain of descent of m within S.
+* If it contains some elements of U, let x_k be the last one of those.
+Then union of U with {x_{k+1}, ..., x_n} is obviously non-revealing.
+* otherwise if all its elements are locked, then none of {x_0, ..., x_n}
+may be elements of a non-revealing subset of S.
+* otherwise let x_k be the first unlocked element of the chain.  Then none
+of {x_0, ..., x_{k-1}} may be an element of a non-revealing subset of
+S and union of U and {x_k, ..., x_n} is non-revealing.
+
+handle_locked(m) finds which of these cases applies and adjusts Candidates
+and U accordingly.  U remains non-revealing, union of Candidates and
+U still contains any non-revealing subset of S and after the call of
+handle_locked(m) m is guaranteed to be not in Candidates list.  So having
+it called for each element of S would suffice to empty Candidates,
+leaving U the maximal non-revealing subset of S.
+
+However, handle_locked(m) is a no-op when m belongs to U, so it's enough
+to have it called for elements of Candidates list until none remain.
+
+Time complexity: number of calls of handle_locked() is limited by
+#Candidates, each iteration of the first loop in handle_locked() removes
+an element from the list, so their total number of executions is also
+limited by #Candidates; number of iterations in the second loop is no
+greater than the number of iterations of the first loop.
+
+
+		Reparenting
+
+After we'd calculated the final set, we still need to deal with
+reparenting - if an element of the final set has a child not in it,
+we need to reparent such child.
+
+Such children can only be root-overmounting (otherwise the set wouldn't
+be non-shifting) and their parents can not belong to the original set,
+since the original is guaranteed to be closed.
+
+
+		Putting all of that together
+
+The plan is to
+	* find all candidates
+	* trim down to maximal non-shifting subset
+	* trim down to maximal non-revealing subset
+	* reparent anything that needs to be reparented
+	* return the resulting set to the caller
+
+For the 2nd and 3rd steps we want to separate the set into growing
+non-revealing subset, initially containing the original set ("U" in
+terms of the pseudocode above) and everything we are still not sure about
+("candidates").  It means that for the output of the 1st step we'd like
+the extra candidates separated from the stuff already in the original set.
+For the 4th step we would like the additions to U separate from the
+original set.
+
+So let's go for
+	* original set ("set").  Linkage via mnt_list
+	* undecided candidates ("candidates").  Subset of a list,
+consisting of all its elements marked with a new flag (MNT_UMOUNT_CANDIDATE).
+Initially all elements of the list will be marked that way; in the
+end the list will become empty and no mounts will remain marked with
+that flag.
+	* Reuse MNT_MARKED for "has been already seen by trim_ancestors()".
+	* anything in U that hadn't been in the original set - elements of
+candidates will gradually be either discarded or moved there.  In other
+words, it's the candidates we have already decided to unmount.	Its role
+is reasonably close to the old "to_umount", so let's use that name.
+Linkage via mnt_list.
+
+For gather_candidates() we'll need to maintain both candidates (S -
+set) and intersection of S with set.  Use MNT_UMOUNT_CANDIDATE for
+all elements we encounter, putting the ones not already in the original
+set into the list of candidates.  When we are done, strip that flag from
+all elements of the original set.  That gives a cheap way to check
+if element belongs to S (in gather_candidates) and to candidates
+itself (at later stages).  Call that predicate is_candidate(); it would
+be m->mnt_flags & MNT_UMOUNT_CANDIDATE.
+
+All elements of the original set are marked with MNT_UMOUNT and we'll
+need the same for elements added when joining the contents of to_umount
+to set in the end.  Let's set MNT_UMOUNT at the time we add an element
+to to_umount; that's close to what the old 'umount_one' is doing, so
+let's keep that name.  It also gives us another predicate we need -
+"belongs to union of set and to_umount"; will_be_unmounted() for now.
+
+Removals from the candidates list should strip both MNT_MARKED and
+MNT_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
diff --git a/fs/mount.h b/fs/mount.h
index f20e6ed845fe..fb93d3e16724 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -79,7 +79,6 @@ struct mount {
 		struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
 		struct hlist_node mnt_umount;
 	};
-	struct list_head mnt_umounting; /* list entry for umount propagation */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6a0697eeda74..f64895d47d70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -383,7 +383,6 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
-		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 		RB_CLEAR_NODE(&mnt->mnt_node);
 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
diff --git a/fs/pnode.c b/fs/pnode.c
index aa187144e389..901d40946d34 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -24,11 +24,6 @@ static inline struct mount *first_slave(struct mount *p)
 	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
 }
 
-static inline struct mount *last_slave(struct mount *p)
-{
-	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
-}
-
 static inline struct mount *next_slave(struct mount *p)
 {
 	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
@@ -136,6 +131,23 @@ void change_mnt_propagation(struct mount *mnt, int type)
 	}
 }
 
+static struct mount *__propagation_next(struct mount *m,
+					 struct mount *origin)
+{
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			return (next == origin) ? NULL : next;
+		} else if (m->mnt_slave.next != &master->mnt_slave_list)
+			return next_slave(m);
+
+		/* back at master */
+		m = master;
+	}
+}
+
 /*
  * get the next mount in the propagation tree.
  * @m: the mount seen last
@@ -153,31 +165,21 @@ static struct mount *propagation_next(struct mount *m,
 	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
 		return first_slave(m);
 
-	while (1) {
-		struct mount *master = m->mnt_master;
-
-		if (master == origin->mnt_master) {
-			struct mount *next = next_peer(m);
-			return (next == origin) ? NULL : next;
-		} else if (m->mnt_slave.next != &master->mnt_slave_list)
-			return next_slave(m);
-
-		/* back at master */
-		m = master;
-	}
+	return __propagation_next(m, origin);
 }
 
 static struct mount *skip_propagation_subtree(struct mount *m,
 						struct mount *origin)
 {
 	/*
-	 * Advance m such that propagation_next will not return
-	 * the slaves of m.
+	 * Advance m past everything that gets propagation from it.
 	 */
-	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
-		m = last_slave(m);
+	struct mount *p = __propagation_next(m, origin);
 
-	return m;
+	while (p && peers(m, p))
+		p = __propagation_next(p, origin);
+
+	return p;
 }
 
 static struct mount *next_group(struct mount *m, struct mount *origin)
@@ -458,181 +460,213 @@ void propagate_mount_unlock(struct mount *mnt)
 	}
 }
 
-static void umount_one(struct mount *mnt, struct list_head *to_umount)
+static inline bool is_candidate(struct mount *m)
 {
-	CLEAR_MNT_MARK(mnt);
-	mnt->mnt.mnt_flags |= MNT_UMOUNT;
-	list_del_init(&mnt->mnt_child);
-	list_del_init(&mnt->mnt_umounting);
-	move_from_ns(mnt, to_umount);
+	return m->mnt.mnt_flags & MNT_UMOUNT_CANDIDATE;
+}
+
+static inline bool will_be_unmounted(struct mount *m)
+{
+	return m->mnt.mnt_flags & MNT_UMOUNT;
+}
+
+static void umount_one(struct mount *m, struct list_head *to_umount)
+{
+	m->mnt.mnt_flags |= MNT_UMOUNT;
+	list_del_init(&m->mnt_child);
+	move_from_ns(m, to_umount);
+}
+
+static void remove_from_candidate_list(struct mount *m)
+{
+	m->mnt.mnt_flags &= ~(MNT_MARKED | MNT_UMOUNT_CANDIDATE);
+	list_del_init(&m->mnt_list);
+}
+
+static void gather_candidates(struct list_head *set,
+			      struct list_head *candidates)
+{
+	struct mount *m, *p, *q;
+
+	list_for_each_entry(m, set, mnt_list) {
+		if (is_candidate(m))
+			continue;
+		m->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+		p = m->mnt_parent;
+		q = propagation_next(p, p);
+		while (q) {
+			struct mount *child = __lookup_mnt(&q->mnt,
+							   m->mnt_mountpoint);
+			if (child) {
+				/*
+				 * We might've already run into this one.  That
+				 * must've happened on earlier iteration of the
+				 * outer loop; in that case we can skip those
+				 * parents that get propagation from q - there
+				 * will be nothing new on those as well.
+				 */
+				if (is_candidate(child)) {
+					q = skip_propagation_subtree(q, p);
+					continue;
+				}
+				child->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+				if (!will_be_unmounted(child))
+					list_add(&child->mnt_list, candidates);
+			}
+			q = propagation_next(q, p);
+		}
+	}
+	list_for_each_entry(m, set, mnt_list)
+		m->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
 }
 
 /*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
+ * We know that some child of @m can't be unmounted.  In all places where the
+ * chain of descent of @m has child not overmounting the root of parent,
+ * the parent can't be unmounted either.
  */
-static bool __propagate_umount(struct mount *mnt,
-			       struct list_head *to_umount,
-			       struct list_head *to_restore)
+static void trim_ancestors(struct mount *m)
 {
-	bool progress = false;
-	struct mount *child;
+	struct mount *p;
 
-	/*
-	 * The state of the parent won't change if this mount is
-	 * already unmounted or marked as without children.
-	 */
-	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
-		goto out;
-
-	/* Verify topper is the only grandchild that has not been
-	 * speculatively unmounted.
-	 */
-	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
-		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-			continue;
-		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
-			continue;
-		/* Found a mounted child */
-		goto children;
-	}
-
-	/* Mark mounts that can be unmounted if not locked */
-	SET_MNT_MARK(mnt);
-	progress = true;
-
-	/* If a mount is without children and not locked umount it. */
-	if (!IS_MNT_LOCKED(mnt)) {
-		umount_one(mnt, to_umount);
-	} else {
-children:
-		list_move_tail(&mnt->mnt_umounting, to_restore);
-	}
-out:
-	return progress;
-}
-
-static void umount_list(struct list_head *to_umount,
-			struct list_head *to_restore)
-{
-	struct mount *mnt, *child, *tmp;
-	list_for_each_entry(mnt, to_umount, mnt_list) {
-		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
-			/* topper? */
-			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-				list_move_tail(&child->mnt_umounting, to_restore);
-			else
-				umount_one(child, to_umount);
-		}
-	}
-}
-
-static void restore_mounts(struct list_head *to_restore)
-{
-	/* Restore mounts to a clean working state */
-	while (!list_empty(to_restore)) {
-		struct mount *mnt, *parent;
-		struct mountpoint *mp;
-
-		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
-		CLEAR_MNT_MARK(mnt);
-		list_del_init(&mnt->mnt_umounting);
-
-		/* Should this mount be reparented? */
-		mp = mnt->mnt_mp;
-		parent = mnt->mnt_parent;
-		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
-			mp = parent->mnt_mp;
-			parent = parent->mnt_parent;
-		}
-		if (parent != mnt->mnt_parent) {
-			mnt_change_mountpoint(parent, mp, mnt);
-			mnt_notify_add(mnt);
-		}
-	}
-}
-
-static void cleanup_umount_visitations(struct list_head *visited)
-{
-	while (!list_empty(visited)) {
-		struct mount *mnt =
-			list_first_entry(visited, struct mount, mnt_umounting);
-		list_del_init(&mnt->mnt_umounting);
+	for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) {
+		if (IS_MNT_MARKED(m))	// all candidates beneath are overmounts
+			return;
+		SET_MNT_MARK(m);
+		if (m != p->overmount)
+			p->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
 	}
 }
 
 /*
- * collect all mounts that receive propagation from the mount in @list,
- * and return these additional mounts in the same list.
- * @list: the list of mounts to be unmounted.
+ * Find and exclude all umount candidates forbidden by @m
+ * (see Documentation/filesystems/propagate_umount.txt)
+ * If we can immediately tell that @m is OK to unmount (unlocked
+ * and all children are already committed to unmounting) commit
+ * to unmounting it.
+ * Only @m itself might be taken from the candidates list;
+ * anything found by trim_ancestors() is marked non-candidate
+ * and left on the list.
+ */
+static void trim_one(struct mount *m, struct list_head *to_umount)
+{
+	bool remove_this = false, found = false, umount_this = false;
+	struct mount *n;
+
+	if (!is_candidate(m)) { // trim_ancestors() left it on list
+		remove_from_candidate_list(m);
+		return;
+	}
+
+	list_for_each_entry(n, &m->mnt_mounts, mnt_child) {
+		if (!is_candidate(n)) {
+			found = true;
+			if (n != m->overmount) {
+				remove_this = true;
+				break;
+			}
+		}
+	}
+	if (found) {
+		trim_ancestors(m);
+	} else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) {
+		remove_this = true;
+		umount_this = true;
+	}
+	if (remove_this) {
+		remove_from_candidate_list(m);
+		if (umount_this)
+			umount_one(m, to_umount);
+	}
+}
+
+static void handle_locked(struct mount *m, struct list_head *to_umount)
+{
+	struct mount *cutoff = m, *p;
+
+	if (!is_candidate(m)) { // trim_ancestors() left it on list
+		remove_from_candidate_list(m);
+		return;
+	}
+	for (p = m; is_candidate(p); p = p->mnt_parent) {
+		remove_from_candidate_list(p);
+		if (!IS_MNT_LOCKED(p))
+			cutoff = p->mnt_parent;
+	}
+	if (will_be_unmounted(p))
+		cutoff = p;
+	while (m != cutoff) {
+		umount_one(m, to_umount);
+		m = m->mnt_parent;
+	}
+}
+
+/*
+ * @m is not to going away, and it overmounts the top of a stack of mounts
+ * that are going away.  We know that all of those are fully overmounted
+ * by the one above (@m being the topmost of the chain), so @m can be slid
+ * in place where the bottom of the stack is attached.
  *
- * vfsmount lock must be held for write
+ * NOTE: here we temporarily violate a constraint - two mounts end up with
+ * the same parent and mountpoint; that will be remedied as soon as we
+ * return from propagate_umount() - its caller (umount_tree()) will detach
+ * the stack from the parent it (and now @m) is attached to.  umount_tree()
+ * might choose to keep unmounted pieces stuck to each other, but it always
+ * detaches them from the mounts that remain in the tree.
  */
-int propagate_umount(struct list_head *list)
+static void reparent(struct mount *m)
 {
-	struct mount *mnt;
-	LIST_HEAD(to_restore);
-	LIST_HEAD(to_umount);
-	LIST_HEAD(visited);
+	struct mount *p = m;
+	struct mountpoint *mp;
 
-	/* Find candidates for unmounting */
-	list_for_each_entry_reverse(mnt, list, mnt_list) {
-		struct mount *parent = mnt->mnt_parent;
-		struct mount *m;
+	do {
+		mp = p->mnt_mp;
+		p = p->mnt_parent;
+	} while (will_be_unmounted(p));
 
-		/*
-		 * If this mount has already been visited it is known that it's
-		 * entire peer group and all of their slaves in the propagation
-		 * tree for the mountpoint has already been visited and there is
-		 * no need to visit them again.
-		 */
-		if (!list_empty(&mnt->mnt_umounting))
-			continue;
+	mnt_change_mountpoint(p, mp, m);
+	mnt_notify_add(m);
+}
 
-		list_add_tail(&mnt->mnt_umounting, &visited);
-		for (m = propagation_next(parent, parent); m;
-		     m = propagation_next(m, parent)) {
-			struct mount *child = __lookup_mnt(&m->mnt,
-							   mnt->mnt_mountpoint);
-			if (!child)
-				continue;
+/**
+ * propagate_umount - apply propagation rules to the set of mounts for umount()
+ * @set: the list of mounts to be unmounted.
+ *
+ * Collect all mounts that receive propagation from the mount in @set and have
+ * no obstacles to being unmounted.  Add these additional mounts to the set.
+ *
+ * See Documentation/filesystems/propagate_umount.txt if you do anything in
+ * this area.
+ *
+ * Locks held:
+ * mount_lock (write_seqlock), namespace_sem (exclusive).
+ */
+void propagate_umount(struct list_head *set)
+{
+	struct mount *m, *p;
+	LIST_HEAD(to_umount);	// committed to unmounting
+	LIST_HEAD(candidates);	// undecided umount candidates
 
-			if (!list_empty(&child->mnt_umounting)) {
-				/*
-				 * If the child has already been visited it is
-				 * know that it's entire peer group and all of
-				 * their slaves in the propgation tree for the
-				 * mountpoint has already been visited and there
-				 * is no need to visit this subtree again.
-				 */
-				m = skip_propagation_subtree(m, parent);
-				continue;
-			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
-				/*
-				 * We have come across a partially unmounted
-				 * mount in a list that has not been visited
-				 * yet. Remember it has been visited and
-				 * continue about our merry way.
-				 */
-				list_add_tail(&child->mnt_umounting, &visited);
-				continue;
-			}
+	// collect all candidates
+	gather_candidates(set, &candidates);
 
-			/* Check the child and parents while progress is made */
-			while (__propagate_umount(child,
-						  &to_umount, &to_restore)) {
-				/* Is the parent a umount candidate? */
-				child = child->mnt_parent;
-				if (list_empty(&child->mnt_umounting))
-					break;
-			}
-		}
+	// reduce the set until it's non-shifting
+	list_for_each_entry_safe(m, p, &candidates, mnt_list)
+		trim_one(m, &to_umount);
+
+	// ... and non-revealing
+	while (!list_empty(&candidates)) {
+		m = list_first_entry(&candidates,struct mount, mnt_list);
+		handle_locked(m, &to_umount);
 	}
 
-	umount_list(&to_umount, &to_restore);
-	restore_mounts(&to_restore);
-	cleanup_umount_visitations(&visited);
-	list_splice_tail(&to_umount, list);
+	// now to_umount consists of all acceptable candidates
+	// deal with reparenting of remaining overmounts on those
+	list_for_each_entry(m, &to_umount, mnt_list) {
+		if (m->overmount)
+			reparent(m->overmount);
+	}
 
-	return 0;
+	// and fold them into the set
+	list_splice_tail_init(&to_umount, set);
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 93fa9311bd07..04f1ac53aa49 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -42,7 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2)
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c145820fcbbf..65fa8442c00a 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -40,6 +40,7 @@ enum mount_flags {
 
 	MNT_INTERNAL	= 0x4000,
 
+	MNT_UMOUNT_CANDIDATE	= 0x020000,
 	MNT_LOCK_ATIME		= 0x040000,
 	MNT_LOCK_NOEXEC		= 0x080000,
 	MNT_LOCK_NOSUID		= 0x100000,
@@ -66,7 +67,7 @@ enum mount_flags {
 
 	MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL |
 			     MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED |
-			     MNT_LOCKED,
+			     MNT_LOCKED | MNT_UMOUNT_CANDIDATE,
 };
 
 struct vfsmount {

From 7c6fb47b2b6c1ffcb1cae9372b9fcf269444487a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 22:34:33 -0400
Subject: [PATCH 17/48] make commit_tree() usable in same-namespace move case

Once attach_recursive_mnt() has created all copies of original subtree,
it needs to put them in place(s).

Steps needed for those are slightly different:
	1) in 'move' case, original copy doesn't need any rbtree
manipulations (everything's already in the same namespace where it will
be), but it needs to be detached from the current location
	2) in 'attach' case, original may be in anon namespace; if it is,
all those mounts need to removed from their current namespace before
insertion into the target one
	3) additional copies have a couple of extra twists - in case
of cross-userns propagation we need to lock everything other the root of
subtree and in case when we end up inserting under an existing mount,
that mount needs to be found (for original copy we have it explicitly
passed by the caller).

Quite a bit of that can be unified; as the first step, make commit_tree()
helper (inserting mounts into namespace, hashing the root of subtree
and marking the namespace as updated) usable in all cases; (2) and (3)
are already using it and for (1) we only need to make the insertion of
mounts into namespace conditional.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index f64895d47d70..937c2a1825f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1172,15 +1172,17 @@ static void commit_tree(struct mount *mnt)
 
 	BUG_ON(parent == mnt);
 
-	list_add_tail(&head, &mnt->mnt_list);
-	while (!list_empty(&head)) {
-		m = list_first_entry(&head, typeof(*m), mnt_list);
-		list_del(&m->mnt_list);
+	if (!mnt_ns_attached(mnt)) {
+		list_add_tail(&head, &mnt->mnt_list);
+		while (!list_empty(&head)) {
+			m = list_first_entry(&head, typeof(*m), mnt_list);
+			list_del(&m->mnt_list);
 
-		mnt_add_to_ns(n, m);
+			mnt_add_to_ns(n, m);
+		}
+		n->nr_mounts += n->pending_mounts;
+		n->pending_mounts = 0;
 	}
-	n->nr_mounts += n->pending_mounts;
-	n->pending_mounts = 0;
 
 	make_visible(mnt);
 	touch_mnt_namespace(n);
@@ -2691,12 +2693,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 
 	if (moving) {
 		unhash_mnt(source_mnt);
-		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		if (beneath)
-			mnt_change_mountpoint(top, smp, top_mnt);
-		make_visible(source_mnt);
 		mnt_notify_add(source_mnt);
-		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		if (source_mnt->mnt_ns) {
 			LIST_HEAD(head);
@@ -2706,12 +2703,13 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 				move_from_ns(p, &head);
 			list_del_init(&head);
 		}
-		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		if (beneath)
-			mnt_change_mountpoint(top, smp, top_mnt);
-		commit_tree(source_mnt);
 	}
 
+	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
+	if (beneath)
+		mnt_change_mountpoint(top, smp, top_mnt);
+	commit_tree(source_mnt);
+
 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
 		struct mount *q;
 		hlist_del_init(&child->mnt_hash);

From 96f5d2e051653f2cfe9137c5d3c7794c358ffb03 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 22:40:48 -0400
Subject: [PATCH 18/48] attach_recursive_mnt(): unify the
 mnt_change_mountpoint() logics

The logics used for tucking under existing mount differs for original
and copies; copies do a mount hash lookup to see if mountpoint to be is
already overmounted, while the original is told explicitly.

But the same logics that is used for copies works for the original,
at which point the only place where we get very close to eliminating
the need of passing 'beneath' flag to attach_recursive_mnt().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 937c2a1825f2..9b8d07df4aa5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2643,7 +2643,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = top_mnt->mnt_ns;
 	struct mountpoint *smp;
-	struct mountpoint *secondary = NULL;
+	struct mountpoint *shorter = NULL;
 	struct mount *child, *dest_mnt, *p;
 	struct mount *top;
 	struct hlist_node *n;
@@ -2655,14 +2655,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	 * mounted beneath mounts on the same mountpoint.
 	 */
 	for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
-		if (!secondary && is_mnt_ns_file(top->mnt.mnt_root))
-			secondary = top->mnt_mp;
+		if (!shorter && is_mnt_ns_file(top->mnt.mnt_root))
+			shorter = top->mnt_mp;
 	}
 	smp = get_mountpoint(top->mnt.mnt_root);
 	if (IS_ERR(smp))
 		return PTR_ERR(smp);
-	if (!secondary)
-		secondary = smp;
 
 	/* Is there space to add these mounts to the mount namespace? */
 	if (!moving) {
@@ -2706,9 +2704,14 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	}
 
 	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-	if (beneath)
-		mnt_change_mountpoint(top, smp, top_mnt);
-	commit_tree(source_mnt);
+	/*
+	 * Now the original copy is in the same state as the secondaries -
+	 * its root attached to mountpoint, but not hashed and all mounts
+	 * in it are either in our namespace or in no namespace at all.
+	 * Add the original to the list of copies and deal with the
+	 * rest of work for all of them uniformly.
+	 */
+	hlist_add_head(&source_mnt->mnt_hash, &tree_list);
 
 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
 		struct mount *q;
@@ -2719,10 +2722,13 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		q = __lookup_mnt(&child->mnt_parent->mnt,
 				 child->mnt_mountpoint);
 		if (q) {
+			struct mountpoint *mp = smp;
 			struct mount *r = child;
 			while (unlikely(r->overmount))
 				r = r->overmount;
-			mnt_change_mountpoint(r, secondary, q);
+			if (unlikely(shorter) && child != source_mnt)
+				mp = shorter;
+			mnt_change_mountpoint(r, mp, q);
 		}
 		commit_tree(child);
 	}

From 18959bf585a85c526c88ae23bd223e1afe997bd7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 22:49:47 -0400
Subject: [PATCH 19/48] attach_recursive_mnt(): pass destination mount in all
 cases

... and 'beneath' is no longer used there

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 9b8d07df4aa5..449e66436b4f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2570,7 +2570,7 @@ enum mnt_tree_flags_t {
 /**
  * attach_recursive_mnt - attach a source mount tree
  * @source_mnt: mount tree to be attached
- * @top_mnt:    mount that @source_mnt will be mounted on or mounted beneath
+ * @dest_mnt:   mount that @source_mnt will be mounted on
  * @dest_mp:    the mountpoint @source_mnt will be mounted at
  * @flags:      modify how @source_mnt is supposed to be attached
  *
@@ -2635,20 +2635,20 @@ enum mnt_tree_flags_t {
  *         Otherwise a negative error code is returned.
  */
 static int attach_recursive_mnt(struct mount *source_mnt,
-				struct mount *top_mnt,
+				struct mount *dest_mnt,
 				struct mountpoint *dest_mp,
 				enum mnt_tree_flags_t flags)
 {
 	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	HLIST_HEAD(tree_list);
-	struct mnt_namespace *ns = top_mnt->mnt_ns;
+	struct mnt_namespace *ns = dest_mnt->mnt_ns;
 	struct mountpoint *smp;
 	struct mountpoint *shorter = NULL;
-	struct mount *child, *dest_mnt, *p;
+	struct mount *child, *p;
 	struct mount *top;
 	struct hlist_node *n;
 	int err = 0;
-	bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
+	bool moving = flags & MNT_TREE_MOVE;
 
 	/*
 	 * Preallocate a mountpoint in case the new mounts need to be
@@ -2669,11 +2669,6 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 			goto out;
 	}
 
-	if (beneath)
-		dest_mnt = top_mnt->mnt_parent;
-	else
-		dest_mnt = top_mnt;
-
 	if (IS_MNT_SHARED(dest_mnt)) {
 		err = invent_group_ids(source_mnt, true);
 		if (err)
@@ -3688,7 +3683,7 @@ static int do_move_mount(struct path *old_path,
 	if (mount_is_ancestor(old, p))
 		goto out;
 
-	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
+	err = attach_recursive_mnt(old, p, mp, flags);
 	if (err)
 		goto out;
 

From 86b1da96c5aeb816e4a1b60aa2b3bdcd87e28522 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 22:54:56 -0400
Subject: [PATCH 20/48] attach_recursive_mnt(): get rid of flags entirely

move vs. attach is trivially detected as mnt_has_parent(source_mnt)...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 449e66436b4f..adb37f06ba68 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2562,9 +2562,8 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
 }
 
 enum mnt_tree_flags_t {
-	MNT_TREE_MOVE = BIT(0),
-	MNT_TREE_BENEATH = BIT(1),
-	MNT_TREE_PROPAGATION = BIT(2),
+	MNT_TREE_BENEATH = BIT(0),
+	MNT_TREE_PROPAGATION = BIT(1),
 };
 
 /**
@@ -2572,7 +2571,6 @@ enum mnt_tree_flags_t {
  * @source_mnt: mount tree to be attached
  * @dest_mnt:   mount that @source_mnt will be mounted on
  * @dest_mp:    the mountpoint @source_mnt will be mounted at
- * @flags:      modify how @source_mnt is supposed to be attached
  *
  *  NOTE: in the table below explains the semantics when a source mount
  *  of a given type is attached to a destination mount of a given type.
@@ -2636,8 +2634,7 @@ enum mnt_tree_flags_t {
  */
 static int attach_recursive_mnt(struct mount *source_mnt,
 				struct mount *dest_mnt,
-				struct mountpoint *dest_mp,
-				enum mnt_tree_flags_t flags)
+				struct mountpoint *dest_mp)
 {
 	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	HLIST_HEAD(tree_list);
@@ -2648,7 +2645,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	struct mount *top;
 	struct hlist_node *n;
 	int err = 0;
-	bool moving = flags & MNT_TREE_MOVE;
+	bool moving = mnt_has_parent(source_mnt);
 
 	/*
 	 * Preallocate a mountpoint in case the new mounts need to be
@@ -2871,7 +2868,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
 	      d_is_dir(mnt->mnt.mnt_root))
 		return -ENOTDIR;
 
-	return attach_recursive_mnt(mnt, p, mp, 0);
+	return attach_recursive_mnt(mnt, p, mp);
 }
 
 /*
@@ -3613,8 +3610,6 @@ static int do_move_mount(struct path *old_path,
 	p = real_mount(new_path->mnt);
 	parent = old->mnt_parent;
 	attached = mnt_has_parent(old);
-	if (attached)
-		flags |= MNT_TREE_MOVE;
 	old_mp = old->mnt_mp;
 	ns = old->mnt_ns;
 
@@ -3668,7 +3663,6 @@ static int do_move_mount(struct path *old_path,
 
 		err = -EINVAL;
 		p = p->mnt_parent;
-		flags |= MNT_TREE_BENEATH;
 	}
 
 	/*
@@ -3683,7 +3677,7 @@ static int do_move_mount(struct path *old_path,
 	if (mount_is_ancestor(old, p))
 		goto out;
 
-	err = attach_recursive_mnt(old, p, mp, flags);
+	err = attach_recursive_mnt(old, p, mp);
 	if (err)
 		goto out;
 

From 761de25854424aa23fd1d7b2bef5c7d184690926 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 12:55:39 -0400
Subject: [PATCH 21/48] do_move_mount(): take dropping the old mountpoint into
 attach_recursive_mnt()

... and fold it with unhash_mnt() there - there's no need to retain a reference
to old_mp beyond that point, since by then all mountpoints we were going to add
are either explicitly pinned by get_mountpoint() or have stuff already added
to them.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index adb37f06ba68..e5f8fde57c99 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2682,7 +2682,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	}
 
 	if (moving) {
-		unhash_mnt(source_mnt);
+		umount_mnt(source_mnt);
 		mnt_notify_add(source_mnt);
 	} else {
 		if (source_mnt->mnt_ns) {
@@ -3598,7 +3598,7 @@ static int do_move_mount(struct path *old_path,
 	struct mount *p;
 	struct mount *old;
 	struct mount *parent;
-	struct mountpoint *mp, *old_mp;
+	struct mountpoint *mp;
 	int err;
 	bool attached, beneath = flags & MNT_TREE_BENEATH;
 
@@ -3610,7 +3610,6 @@ static int do_move_mount(struct path *old_path,
 	p = real_mount(new_path->mnt);
 	parent = old->mnt_parent;
 	attached = mnt_has_parent(old);
-	old_mp = old->mnt_mp;
 	ns = old->mnt_ns;
 
 	err = -EINVAL;
@@ -3684,8 +3683,6 @@ static int do_move_mount(struct path *old_path,
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
 	list_del_init(&old->mnt_expire);
-	if (attached)
-		put_mountpoint(old_mp);
 out:
 	unlock_mount(mp);
 	if (!err) {

From ee1ee33ccc1ba0620a77833b2a3e320588701217 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 8 May 2025 00:09:30 -0400
Subject: [PATCH 22/48] do_move_mount(): get rid of 'attached' flag

'attached' serves as a proxy for "source is a subtree of our namespace
and not the entirety of anon namespace"; finish massaging it away.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index e5f8fde57c99..7c7cc14da1ee 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3600,7 +3600,7 @@ static int do_move_mount(struct path *old_path,
 	struct mount *parent;
 	struct mountpoint *mp;
 	int err;
-	bool attached, beneath = flags & MNT_TREE_BENEATH;
+	bool beneath = flags & MNT_TREE_BENEATH;
 
 	mp = do_lock_mount(new_path, beneath);
 	if (IS_ERR(mp))
@@ -3609,7 +3609,6 @@ static int do_move_mount(struct path *old_path,
 	old = real_mount(old_path->mnt);
 	p = real_mount(new_path->mnt);
 	parent = old->mnt_parent;
-	attached = mnt_has_parent(old);
 	ns = old->mnt_ns;
 
 	err = -EINVAL;
@@ -3622,6 +3621,9 @@ static int do_move_mount(struct path *old_path,
 		/* ... and the target should be in our namespace */
 		if (!check_mnt(p))
 			goto out;
+		/* parent of the source should not be shared */
+		if (IS_MNT_SHARED(parent))
+			goto out;
 	} else {
 		/*
 		 * otherwise the source must be the root of some anon namespace.
@@ -3649,11 +3651,6 @@ static int do_move_mount(struct path *old_path,
 	if (d_is_dir(new_path->dentry) !=
 	    d_is_dir(old_path->dentry))
 		goto out;
-	/*
-	 * Don't move a mount residing in a shared parent.
-	 */
-	if (attached && IS_MNT_SHARED(parent))
-		goto out;
 
 	if (beneath) {
 		err = can_move_mount_beneath(old_path, new_path, mp);
@@ -3686,7 +3683,7 @@ static int do_move_mount(struct path *old_path,
 out:
 	unlock_mount(mp);
 	if (!err) {
-		if (attached) {
+		if (!is_anon_ns(ns)) {
 			mntput_no_expire(parent);
 		} else {
 			/* Make sure we notice when we leak mounts. */

From a8c764e1a580c2128e905ad6e42beef413fb7177 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 May 2025 19:59:30 -0400
Subject: [PATCH 23/48] attach_recursive_mnt(): remove from expiry list on move

... rather than doing that in do_move_mount().  That's the main
obstacle to moving the protection of ->mnt_expire from namespace_sem
to mount_lock (spinlock-only), which would simplify several failure
exits.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 7c7cc14da1ee..e8dc8af87548 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2684,6 +2684,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	if (moving) {
 		umount_mnt(source_mnt);
 		mnt_notify_add(source_mnt);
+		/* if the mount is moved, it should no longer be expired
+		 * automatically */
+		list_del_init(&source_mnt->mnt_expire);
 	} else {
 		if (source_mnt->mnt_ns) {
 			LIST_HEAD(head);
@@ -3674,12 +3677,6 @@ static int do_move_mount(struct path *old_path,
 		goto out;
 
 	err = attach_recursive_mnt(old, p, mp);
-	if (err)
-		goto out;
-
-	/* if the mount is moved, it should no longer be expire
-	 * automatically */
-	list_del_init(&old->mnt_expire);
 out:
 	unlock_mount(mp);
 	if (!err) {

From ec3265a245b22f8b8a0b20e04dd1d3f4f4a9ce09 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 May 2025 20:40:57 -0400
Subject: [PATCH 24/48] take ->mnt_expire handling under mount_lock
 [read_seqlock_excl]

Doesn't take much massage, and we no longer need to make sure that
by the time of final mntput() the victim has been removed from the
list.  Makes life safer for ->d_automount() instances...

Rules:
	* all ->mnt_expire accesses are under mount_lock.
	* insertion into the list is done by mnt_set_expiry(), and
caller (->d_automount() instance) must hold a reference to mount
in question.  It shouldn't be done more than once for a mount.
	* if a mount on an expiry list is not yet mounted, it will
be ignored by anything that walks that list.
	* if the final mntput() finds its victim still on an expiry
list (in which case it must've never been mounted - umount_tree()
would've taken it out), it will remove the victim from the list.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index e8dc8af87548..ff2281f780dc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1353,13 +1353,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 		list_add(&mnt->mnt_slave, &old->mnt_slave);
 		mnt->mnt_master = old->mnt_master;
 	}
-
-	/* stick the duplicate mount on the same expiry list
-	 * as the original if that was on one */
-	if (flag & CL_EXPIRE) {
-		if (!list_empty(&old->mnt_expire))
-			list_add(&mnt->mnt_expire, &old->mnt_expire);
-	}
 	return mnt;
 
  out_free:
@@ -1452,6 +1445,8 @@ static void mntput_no_expire(struct mount *mnt)
 	rcu_read_unlock();
 
 	list_del(&mnt->mnt_instance);
+	if (unlikely(!list_empty(&mnt->mnt_expire)))
+		list_del(&mnt->mnt_expire);
 
 	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
 		struct mount *p, *tmp;
@@ -2273,6 +2268,13 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
 			lock_mount_hash();
 			if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
 				dst_mnt->mnt.mnt_flags |= MNT_LOCKED;
+			if (unlikely(flag & CL_EXPIRE)) {
+				/* stick the duplicate mount on the same expiry
+				 * list as the original if that was on one */
+				if (!list_empty(&src_mnt->mnt_expire))
+					list_add(&dst_mnt->mnt_expire,
+						 &src_mnt->mnt_expire);
+			}
 			list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
 			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
 			unlock_mount_hash();
@@ -3891,12 +3893,6 @@ discard_locked:
 	namespace_unlock();
 	inode_unlock(dentry->d_inode);
 discard:
-	/* remove m from any expiration list it may be on */
-	if (!list_empty(&mnt->mnt_expire)) {
-		namespace_lock();
-		list_del_init(&mnt->mnt_expire);
-		namespace_unlock();
-	}
 	mntput(m);
 	return err;
 }
@@ -3908,11 +3904,9 @@ discard:
  */
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
-	namespace_lock();
-
+	read_seqlock_excl(&mount_lock);
 	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
-
-	namespace_unlock();
+	read_sequnlock_excl(&mount_lock);
 }
 EXPORT_SYMBOL(mnt_set_expiry);
 

From e30da2a20e31ab958d41e5a2c764968d95f17b61 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 16:53:01 -0400
Subject: [PATCH 25/48] pivot_root(): reorder tree surgeries, collapse
 unhash_mnt() and put_mountpoint()

attach new_mnt *before* detaching root_mnt; that way we don't need to keep hold
on the mountpoint and one more pair of unhash_mnt()/put_mountpoint() gets
folded together into umount_mnt().

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index ff2281f780dc..eee73e945a54 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4685,7 +4685,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 {
 	struct path new, old, root;
 	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
-	struct mountpoint *old_mp, *root_mp;
+	struct mountpoint *old_mp;
 	int error;
 
 	if (!may_mount())
@@ -4748,20 +4748,19 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out4;
 	lock_mount_hash();
 	umount_mnt(new_mnt);
-	root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
 	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
 		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
 		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
 	}
+	/* mount new_root on / */
+	attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp);
+	umount_mnt(root_mnt);
+	mnt_add_count(root_parent, -1);
 	/* mount old root on put_old */
 	attach_mnt(root_mnt, old_mnt, old_mp);
-	/* mount new_root on / */
-	attach_mnt(new_mnt, root_parent, root_mp);
-	mnt_add_count(root_parent, -1);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	/* A moved mount should not expire automatically */
 	list_del_init(&new_mnt->mnt_expire);
-	put_mountpoint(root_mp);
 	unlock_mount_hash();
 	mnt_notify_add(root_mnt);
 	mnt_notify_add(new_mnt);

From 86f63980964b334ad49d5c1f132f3b9491303a15 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 17:24:10 -0400
Subject: [PATCH 26/48] combine __put_mountpoint() with unhash_mnt()

A call of unhash_mnt() is immediately followed by passing its return
value to __put_mountpoint(); the shrink list given to __put_mountpoint()
will be ex_mountpoints when called from umount_mnt() and list when called
from mntput_no_expire().

Replace with __umount_mnt(mount, shrink_list), moving the call of
__put_mountpoint() into it (and returning nothing), adjust the
callers.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index eee73e945a54..521ffa52c906 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1037,9 +1037,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 }
 
 /*
- * vfsmount lock must be held for write
+ * locks: mount_lock[write_seqlock]
  */
-static struct mountpoint *unhash_mnt(struct mount *mnt)
+static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list)
 {
 	struct mountpoint *mp;
 	struct mount *parent = mnt->mnt_parent;
@@ -1052,15 +1052,15 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)
 	hlist_del_init(&mnt->mnt_mp_list);
 	mp = mnt->mnt_mp;
 	mnt->mnt_mp = NULL;
-	return mp;
+	__put_mountpoint(mp, shrink_list);
 }
 
 /*
- * vfsmount lock must be held for write
+ * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)
  */
 static void umount_mnt(struct mount *mnt)
 {
-	put_mountpoint(unhash_mnt(mnt));
+	__umount_mnt(mnt, &ex_mountpoints);
 }
 
 /*
@@ -1451,7 +1451,7 @@ static void mntput_no_expire(struct mount *mnt)
 	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
 		struct mount *p, *tmp;
 		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
-			__put_mountpoint(unhash_mnt(p), &list);
+			__umount_mnt(p, &list);
 			hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
 		}
 	}

From d72c773237c0472e214cda92016ad21625b05bba Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Apr 2025 20:21:23 -0400
Subject: [PATCH 27/48] get rid of mountpoint->m_count

struct mountpoint has an odd kinda-sorta refcount in it.  It's always
either equal to or one above the number of mounts attached to that
mountpoint.

"One above" happens when a function takes a temporary reference to
mountpoint.  Things get simpler if we express that as inserting
a local object into ->m_list and removing it to drop the reference.

New calling conventions:

1) lock_mount(), do_lock_mount(), get_mountpoint() and lookup_mountpoint()
take an extra struct pinned_mountpoint * argument and returns 0/-E...
(or true/false in case of lookup_mountpoint()) instead of returning
struct mountpoint pointers.  In case of success, the struct mountpoint *
we used to get can be found as pinned_mountpoint.mp

2) unlock_mount() (always paired with lock_mount()/do_lock_mount()) takes
an address of struct pinned_mountpoint - the same that had been passed to
lock_mount()/do_lock_mount().

3) put_mountpoint() for a temporary reference (paired with get_mountpoint()
or lookup_mountpoint()) is replaced with unpin_mountpoint(), which takes
the address of pinned_mountpoint we passed to matching {get,lookup}_mountpoint().

4) all instances of pinned_mountpoint are local variables; they always live on
stack.  {} is used for initializer, after successful {get,lookup}_mountpoint()
we must make sure to call unpin_mountpoint() before leaving the scope and
after successful {do_,}lock_mount() we must make sure to call unlock_mount()
before leaving the scope.

5) all manipulations of ->m_count are gone, along with ->m_count itself.
struct mountpoint lives while its ->m_list is non-empty.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |   1 -
 fs/namespace.c | 186 ++++++++++++++++++++++++-------------------------
 2 files changed, 92 insertions(+), 95 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index fb93d3e16724..4355c482a841 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -44,7 +44,6 @@ struct mountpoint {
 	struct hlist_node m_hash;
 	struct dentry *m_dentry;
 	struct hlist_head m_list;
-	int m_count;
 };
 
 struct mount {
diff --git a/fs/namespace.c b/fs/namespace.c
index 521ffa52c906..6df0436bfcb9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -910,42 +910,48 @@ bool __is_local_mountpoint(const struct dentry *dentry)
 	return is_covered;
 }
 
-static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
+struct pinned_mountpoint {
+	struct hlist_node node;
+	struct mountpoint *mp;
+};
+
+static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
 {
 	struct hlist_head *chain = mp_hash(dentry);
 	struct mountpoint *mp;
 
 	hlist_for_each_entry(mp, chain, m_hash) {
 		if (mp->m_dentry == dentry) {
-			mp->m_count++;
-			return mp;
+			hlist_add_head(&m->node, &mp->m_list);
+			m->mp = mp;
+			return true;
 		}
 	}
-	return NULL;
+	return false;
 }
 
-static struct mountpoint *get_mountpoint(struct dentry *dentry)
+static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
 {
-	struct mountpoint *mp, *new = NULL;
+	struct mountpoint *mp __free(kfree) = NULL;
+	bool found;
 	int ret;
 
 	if (d_mountpoint(dentry)) {
 		/* might be worth a WARN_ON() */
 		if (d_unlinked(dentry))
-			return ERR_PTR(-ENOENT);
+			return -ENOENT;
 mountpoint:
 		read_seqlock_excl(&mount_lock);
-		mp = lookup_mountpoint(dentry);
+		found = lookup_mountpoint(dentry, m);
 		read_sequnlock_excl(&mount_lock);
-		if (mp)
-			goto done;
+		if (found)
+			return 0;
 	}
 
-	if (!new)
-		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
-	if (!new)
-		return ERR_PTR(-ENOMEM);
-
+	if (!mp)
+		mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+	if (!mp)
+		return -ENOMEM;
 
 	/* Exactly one processes may set d_mounted */
 	ret = d_set_mounted(dentry);
@@ -955,34 +961,28 @@ mountpoint:
 		goto mountpoint;
 
 	/* The dentry is not available as a mountpoint? */
-	mp = ERR_PTR(ret);
 	if (ret)
-		goto done;
+		return ret;
 
 	/* Add the new mountpoint to the hash table */
 	read_seqlock_excl(&mount_lock);
-	new->m_dentry = dget(dentry);
-	new->m_count = 1;
-	hlist_add_head(&new->m_hash, mp_hash(dentry));
-	INIT_HLIST_HEAD(&new->m_list);
+	mp->m_dentry = dget(dentry);
+	hlist_add_head(&mp->m_hash, mp_hash(dentry));
+	INIT_HLIST_HEAD(&mp->m_list);
+	hlist_add_head(&m->node, &mp->m_list);
+	m->mp = no_free_ptr(mp);
 	read_sequnlock_excl(&mount_lock);
-
-	mp = new;
-	new = NULL;
-done:
-	kfree(new);
-	return mp;
+	return 0;
 }
 
 /*
  * vfsmount lock must be held.  Additionally, the caller is responsible
  * for serializing calls for given disposal list.
  */
-static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
+static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list)
 {
-	if (!--mp->m_count) {
+	if (hlist_empty(&mp->m_list)) {
 		struct dentry *dentry = mp->m_dentry;
-		BUG_ON(!hlist_empty(&mp->m_list));
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags &= ~DCACHE_MOUNTED;
 		spin_unlock(&dentry->d_lock);
@@ -992,10 +992,15 @@ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
 	}
 }
 
-/* called with namespace_lock and vfsmount lock */
-static void put_mountpoint(struct mountpoint *mp)
+/*
+ * locks: mount_lock [read_seqlock_excl], namespace_sem [excl]
+ */
+static void unpin_mountpoint(struct pinned_mountpoint *m)
 {
-	__put_mountpoint(mp, &ex_mountpoints);
+	if (m->mp) {
+		hlist_del(&m->node);
+		maybe_free_mountpoint(m->mp, &ex_mountpoints);
+	}
 }
 
 static inline int check_mnt(struct mount *mnt)
@@ -1052,7 +1057,7 @@ static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list)
 	hlist_del_init(&mnt->mnt_mp_list);
 	mp = mnt->mnt_mp;
 	mnt->mnt_mp = NULL;
-	__put_mountpoint(mp, shrink_list);
+	maybe_free_mountpoint(mp, shrink_list);
 }
 
 /*
@@ -1070,7 +1075,6 @@ void mnt_set_mountpoint(struct mount *mnt,
 			struct mountpoint *mp,
 			struct mount *child_mnt)
 {
-	mp->m_count++;
 	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
 	child_mnt->mnt_mountpoint = mp->m_dentry;
 	child_mnt->mnt_parent = mnt;
@@ -1122,7 +1126,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
 
 	attach_mnt(mnt, parent, mp);
 
-	put_mountpoint(old_mp);
+	maybe_free_mountpoint(old_mp, &ex_mountpoints);
 	mnt_add_count(old_parent, -1);
 }
 
@@ -2030,25 +2034,24 @@ out:
  */
 void __detach_mounts(struct dentry *dentry)
 {
-	struct mountpoint *mp;
+	struct pinned_mountpoint mp = {};
 	struct mount *mnt;
 
 	namespace_lock();
 	lock_mount_hash();
-	mp = lookup_mountpoint(dentry);
-	if (!mp)
+	if (!lookup_mountpoint(dentry, &mp))
 		goto out_unlock;
 
 	event++;
-	while (!hlist_empty(&mp->m_list)) {
-		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
+	while (mp.node.next) {
+		mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list);
 		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
 			umount_mnt(mnt);
 			hlist_add_head(&mnt->mnt_umount, &unmounted);
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
-	put_mountpoint(mp);
+	unpin_mountpoint(&mp);
 out_unlock:
 	unlock_mount_hash();
 	namespace_unlock();
@@ -2641,7 +2644,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
-	struct mountpoint *smp;
+	struct pinned_mountpoint root = {};
 	struct mountpoint *shorter = NULL;
 	struct mount *child, *p;
 	struct mount *top;
@@ -2657,9 +2660,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		if (!shorter && is_mnt_ns_file(top->mnt.mnt_root))
 			shorter = top->mnt_mp;
 	}
-	smp = get_mountpoint(top->mnt.mnt_root);
-	if (IS_ERR(smp))
-		return PTR_ERR(smp);
+	err = get_mountpoint(top->mnt.mnt_root, &root);
+	if (err)
+		return err;
 
 	/* Is there space to add these mounts to the mount namespace? */
 	if (!moving) {
@@ -2719,7 +2722,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		q = __lookup_mnt(&child->mnt_parent->mnt,
 				 child->mnt_mountpoint);
 		if (q) {
-			struct mountpoint *mp = smp;
+			struct mountpoint *mp = root.mp;
 			struct mount *r = child;
 			while (unlikely(r->overmount))
 				r = r->overmount;
@@ -2729,7 +2732,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		}
 		commit_tree(child);
 	}
-	put_mountpoint(smp);
+	unpin_mountpoint(&root);
 	unlock_mount_hash();
 
 	return 0;
@@ -2746,7 +2749,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	ns->pending_mounts = 0;
 
 	read_seqlock_excl(&mount_lock);
-	put_mountpoint(smp);
+	unpin_mountpoint(&root);
 	read_sequnlock_excl(&mount_lock);
 
 	return err;
@@ -2786,12 +2789,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
  * Return: Either the target mountpoint on the top mount or the top
  *         mount's mountpoint.
  */
-static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
+static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath)
 {
 	struct vfsmount *mnt = path->mnt;
 	struct dentry *dentry;
-	struct mountpoint *mp = ERR_PTR(-ENOENT);
 	struct path under = {};
+	int err = -ENOENT;
 
 	for (;;) {
 		struct mount *m = real_mount(mnt);
@@ -2829,8 +2832,8 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
 			path->dentry = dget(mnt->mnt_root);
 			continue;	// got overmounted
 		}
-		mp = get_mountpoint(dentry);
-		if (IS_ERR(mp))
+		err = get_mountpoint(dentry, pinned);
+		if (err)
 			break;
 		if (beneath) {
 			/*
@@ -2841,25 +2844,25 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
 			 */
 			path_put(&under);
 		}
-		return mp;
+		return 0;
 	}
 	namespace_unlock();
 	inode_unlock(dentry->d_inode);
 	if (beneath)
 		path_put(&under);
-	return mp;
+	return err;
 }
 
-static inline struct mountpoint *lock_mount(struct path *path)
+static inline int lock_mount(struct path *path, struct pinned_mountpoint *m)
 {
-	return do_lock_mount(path, false);
+	return do_lock_mount(path, m, false);
 }
 
-static void unlock_mount(struct mountpoint *where)
+static void unlock_mount(struct pinned_mountpoint *m)
 {
-	inode_unlock(where->m_dentry->d_inode);
+	inode_unlock(m->mp->m_dentry->d_inode);
 	read_seqlock_excl(&mount_lock);
-	put_mountpoint(where);
+	unpin_mountpoint(m);
 	read_sequnlock_excl(&mount_lock);
 	namespace_unlock();
 }
@@ -3024,7 +3027,7 @@ static int do_loopback(struct path *path, const char *old_name,
 {
 	struct path old_path;
 	struct mount *mnt = NULL, *parent;
-	struct mountpoint *mp;
+	struct pinned_mountpoint mp = {};
 	int err;
 	if (!old_name || !*old_name)
 		return -EINVAL;
@@ -3036,11 +3039,9 @@ static int do_loopback(struct path *path, const char *old_name,
 	if (mnt_ns_loop(old_path.dentry))
 		goto out;
 
-	mp = lock_mount(path);
-	if (IS_ERR(mp)) {
-		err = PTR_ERR(mp);
+	err = lock_mount(path, &mp);
+	if (err)
 		goto out;
-	}
 
 	parent = real_mount(path->mnt);
 	if (!check_mnt(parent))
@@ -3052,14 +3053,14 @@ static int do_loopback(struct path *path, const char *old_name,
 		goto out2;
 	}
 
-	err = graft_tree(mnt, parent, mp);
+	err = graft_tree(mnt, parent, mp.mp);
 	if (err) {
 		lock_mount_hash();
 		umount_tree(mnt, UMOUNT_SYNC);
 		unlock_mount_hash();
 	}
 out2:
-	unlock_mount(mp);
+	unlock_mount(&mp);
 out:
 	path_put(&old_path);
 	return err;
@@ -3603,13 +3604,13 @@ static int do_move_mount(struct path *old_path,
 	struct mount *p;
 	struct mount *old;
 	struct mount *parent;
-	struct mountpoint *mp;
+	struct pinned_mountpoint mp;
 	int err;
 	bool beneath = flags & MNT_TREE_BENEATH;
 
-	mp = do_lock_mount(new_path, beneath);
-	if (IS_ERR(mp))
-		return PTR_ERR(mp);
+	err = do_lock_mount(new_path, &mp, beneath);
+	if (err)
+		return err;
 
 	old = real_mount(old_path->mnt);
 	p = real_mount(new_path->mnt);
@@ -3658,7 +3659,7 @@ static int do_move_mount(struct path *old_path,
 		goto out;
 
 	if (beneath) {
-		err = can_move_mount_beneath(old_path, new_path, mp);
+		err = can_move_mount_beneath(old_path, new_path, mp.mp);
 		if (err)
 			goto out;
 
@@ -3678,9 +3679,9 @@ static int do_move_mount(struct path *old_path,
 	if (mount_is_ancestor(old, p))
 		goto out;
 
-	err = attach_recursive_mnt(old, p, mp);
+	err = attach_recursive_mnt(old, p, mp.mp);
 out:
-	unlock_mount(mp);
+	unlock_mount(&mp);
 	if (!err) {
 		if (!is_anon_ns(ns)) {
 			mntput_no_expire(parent);
@@ -3750,7 +3751,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
 			   unsigned int mnt_flags)
 {
 	struct vfsmount *mnt;
-	struct mountpoint *mp;
+	struct pinned_mountpoint mp = {};
 	struct super_block *sb = fc->root->d_sb;
 	int error;
 
@@ -3771,13 +3772,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
 
 	mnt_warn_timestamp_expiry(mountpoint, mnt);
 
-	mp = lock_mount(mountpoint);
-	if (IS_ERR(mp)) {
-		mntput(mnt);
-		return PTR_ERR(mp);
+	error = lock_mount(mountpoint, &mp);
+	if (!error) {
+		error = do_add_mount(real_mount(mnt), mp.mp,
+				     mountpoint, mnt_flags);
+		unlock_mount(&mp);
 	}
-	error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
-	unlock_mount(mp);
 	if (error < 0)
 		mntput(mnt);
 	return error;
@@ -3845,7 +3845,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 int finish_automount(struct vfsmount *m, const struct path *path)
 {
 	struct dentry *dentry = path->dentry;
-	struct mountpoint *mp;
+	struct pinned_mountpoint mp = {};
 	struct mount *mnt;
 	int err;
 
@@ -3877,14 +3877,13 @@ int finish_automount(struct vfsmount *m, const struct path *path)
 		err = 0;
 		goto discard_locked;
 	}
-	mp = get_mountpoint(dentry);
-	if (IS_ERR(mp)) {
-		err = PTR_ERR(mp);
+	err = get_mountpoint(dentry, &mp);
+	if (err)
 		goto discard_locked;
-	}
 
-	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
-	unlock_mount(mp);
+	err = do_add_mount(mnt, mp.mp, path,
+			   path->mnt->mnt_flags | MNT_SHRINKABLE);
+	unlock_mount(&mp);
 	if (unlikely(err))
 		goto discard;
 	return 0;
@@ -4685,7 +4684,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 {
 	struct path new, old, root;
 	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
-	struct mountpoint *old_mp;
+	struct pinned_mountpoint old_mp = {};
 	int error;
 
 	if (!may_mount())
@@ -4706,9 +4705,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out2;
 
 	get_fs_root(current->fs, &root);
-	old_mp = lock_mount(&old);
-	error = PTR_ERR(old_mp);
-	if (IS_ERR(old_mp))
+	error = lock_mount(&old, &old_mp);
+	if (error)
 		goto out3;
 
 	error = -EINVAL;
@@ -4757,7 +4755,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	umount_mnt(root_mnt);
 	mnt_add_count(root_parent, -1);
 	/* mount old root on put_old */
-	attach_mnt(root_mnt, old_mnt, old_mp);
+	attach_mnt(root_mnt, old_mnt, old_mp.mp);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	/* A moved mount should not expire automatically */
 	list_del_init(&new_mnt->mnt_expire);
@@ -4767,7 +4765,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	chroot_fs_refs(&root, &new);
 	error = 0;
 out4:
-	unlock_mount(old_mp);
+	unlock_mount(&old_mp);
 	if (!error)
 		mntput_no_expire(ex_parent);
 out3:

From 493a4bebf5157a5da64e36f8d468ff80a859b563 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 27 Apr 2025 22:53:13 -0400
Subject: [PATCH 28/48] don't have mounts pin their parents

Simplify the rules for mount refcounts.  Current rules include:
	* being a namespace root => +1
	* being someone's child => +1
	* being someone's child => +1 to parent's refcount, unless you've
				   already been through umount_tree().

The last part is not needed at all.  It makes for more places where need
to decrement refcounts and it creates an asymmetry between the situations
for something that has never been a part of a namespace and something that
left one, both for no good reason.

If mount's refcount has additions from its children, we know that
	* it's either someone's child itself (and will remain so
until umount_tree(), at which point contributions from children
will disappear), or
	* or is the root of namespace (and will remain such until
it either becomes someone's child in another namespace or goes through
umount_tree()), or
	* it is the root of some tree copy, and is currently pinned
by the caller of copy_tree() (and remains such until it either gets
into namespace, or goes to umount_tree()).
In all cases we already have contribution(s) to refcount that will last
as long as the contribution from children remains.  In other words, the
lifetime is not affected by refcount contributions from children.

It might be useful for "is it busy" checks, but those are actually
no harder to express without it.

NB: propagate_mnt_busy() part is an equivalent transformation, ugly as it
is; the current logics is actually wrong and may give false negatives,
but fixing that is for a separate patch (probably earlier in the queue).

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 31 +++++++++----------------------
 fs/pnode.c     | 49 +++++++++++++++++--------------------------------
 2 files changed, 26 insertions(+), 54 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 6df0436bfcb9..4bdf6a6e75ca 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1075,7 +1075,6 @@ void mnt_set_mountpoint(struct mount *mnt,
 			struct mountpoint *mp,
 			struct mount *child_mnt)
 {
-	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
 	child_mnt->mnt_mountpoint = mp->m_dentry;
 	child_mnt->mnt_parent = mnt;
 	child_mnt->mnt_mp = mp;
@@ -1118,7 +1117,6 @@ static void attach_mnt(struct mount *mnt, struct mount *parent,
 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
 	struct mountpoint *old_mp = mnt->mnt_mp;
-	struct mount *old_parent = mnt->mnt_parent;
 
 	list_del_init(&mnt->mnt_child);
 	hlist_del_init(&mnt->mnt_mp_list);
@@ -1127,7 +1125,6 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
 	attach_mnt(mnt, parent, mp);
 
 	maybe_free_mountpoint(old_mp, &ex_mountpoints);
-	mnt_add_count(old_parent, -1);
 }
 
 static inline struct mount *node_to_mount(struct rb_node *node)
@@ -1652,23 +1649,19 @@ const struct seq_operations mounts_op = {
 int may_umount_tree(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
-	int actual_refs = 0;
-	int minimum_refs = 0;
-	struct mount *p;
-	BUG_ON(!m);
+	bool busy = false;
 
 	/* write lock needed for mnt_get_count */
 	lock_mount_hash();
-	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		actual_refs += mnt_get_count(p);
-		minimum_refs += 2;
+	for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
+		if (mnt_get_count(p) > (p == mnt ? 2 : 1)) {
+			busy = true;
+			break;
+		}
 	}
 	unlock_mount_hash();
 
-	if (actual_refs > minimum_refs)
-		return 0;
-
-	return 1;
+	return !busy;
 }
 
 EXPORT_SYMBOL(may_umount_tree);
@@ -1869,7 +1862,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 
 		disconnect = disconnect_mount(p, how);
 		if (mnt_has_parent(p)) {
-			mnt_add_count(p->mnt_parent, -1);
 			if (!disconnect) {
 				/* Don't forget about p */
 				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
@@ -1946,7 +1938,7 @@ static int do_umount(struct mount *mnt, int flags)
 		 * all race cases, but it's a slowpath.
 		 */
 		lock_mount_hash();
-		if (mnt_get_count(mnt) != 2) {
+		if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) {
 			unlock_mount_hash();
 			return -EBUSY;
 		}
@@ -3683,9 +3675,7 @@ static int do_move_mount(struct path *old_path,
 out:
 	unlock_mount(&mp);
 	if (!err) {
-		if (!is_anon_ns(ns)) {
-			mntput_no_expire(parent);
-		} else {
+		if (is_anon_ns(ns)) {
 			/* Make sure we notice when we leak mounts. */
 			VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
 			free_mnt_ns(ns);
@@ -4753,7 +4743,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	/* mount new_root on / */
 	attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp);
 	umount_mnt(root_mnt);
-	mnt_add_count(root_parent, -1);
 	/* mount old root on put_old */
 	attach_mnt(root_mnt, old_mnt, old_mp.mp);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
@@ -4766,8 +4755,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = 0;
 out4:
 	unlock_mount(&old_mp);
-	if (!error)
-		mntput_no_expire(ex_parent);
 out3:
 	path_put(&root);
 out2:
diff --git a/fs/pnode.c b/fs/pnode.c
index 901d40946d34..827d71736ac5 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -332,21 +332,6 @@ out:
 	return ret;
 }
 
-static struct mount *find_topper(struct mount *mnt)
-{
-	/* If there is exactly one mount covering mnt completely return it. */
-	struct mount *child;
-
-	if (!list_is_singular(&mnt->mnt_mounts))
-		return NULL;
-
-	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
-	if (child->mnt_mountpoint != mnt->mnt.mnt_root)
-		return NULL;
-
-	return child;
-}
-
 /*
  * return true if the refcount is greater than count
  */
@@ -404,12 +389,8 @@ bool propagation_would_overmount(const struct mount *from,
  */
 int propagate_mount_busy(struct mount *mnt, int refcnt)
 {
-	struct mount *m, *child, *topper;
 	struct mount *parent = mnt->mnt_parent;
 
-	if (mnt == parent)
-		return do_refcount_check(mnt, refcnt);
-
 	/*
 	 * quickly check if the current mount can be unmounted.
 	 * If not, we don't have to go checking for all other
@@ -418,23 +399,27 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 	if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
 		return 1;
 
-	for (m = propagation_next(parent, parent); m;
+	if (mnt == parent)
+		return 0;
+
+	for (struct mount *m = propagation_next(parent, parent); m;
 	     		m = propagation_next(m, parent)) {
-		int count = 1;
-		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
+		struct list_head *head;
+		struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
+
 		if (!child)
 			continue;
 
-		/* Is there exactly one mount on the child that covers
-		 * it completely whose reference should be ignored?
-		 */
-		topper = find_topper(child);
-		if (topper)
-			count += 1;
-		else if (!list_empty(&child->mnt_mounts))
-			continue;
-
-		if (do_refcount_check(child, count))
+		head = &child->mnt_mounts;
+		if (!list_empty(head)) {
+			/*
+			 * a mount that covers child completely wouldn't prevent
+			 * it being pulled out; any other would.
+			 */
+			if (!list_is_singular(head) || !child->overmount)
+				continue;
+		}
+		if (do_refcount_check(child, 1))
 			return 1;
 	}
 	return 0;

From 406fea79992561f47fd3511dd8b7c8abeeff7045 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 21 Jun 2025 18:06:19 -0400
Subject: [PATCH 29/48] mount: separate the flags accessed only under
 namespace_sem

Several flags are updated and checked only under namespace_sem; we are
already making use of that when we are checking them without mount_lock,
but we have to hold mount_lock for all updates, which makes things
clumsier than they have to be.

Take MNT_SHARED, MNT_UNBINDABLE, MNT_MARKED and MNT_UMOUNT_CANDIDATE
into a separate field (->mnt_t_flags), renaming them to T_SHARED,
etc. to avoid confusion.  All accesses must be under namespace_sem.

That changes locking requirements for mnt_change_propagation() and
set_mnt_shared() - only namespace_sem is needed now.  The same goes
for SET_MNT_MARKED et.al.

There might be more flags moved from ->mnt_flags to that field;
this is just the initial set.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 .../filesystems/propagate_umount.txt          | 12 +++++-----
 fs/mount.h                                    | 17 ++++++++++++++
 fs/namespace.c                                |  4 ----
 fs/pnode.c                                    | 22 +++++++++----------
 fs/pnode.h                                    | 19 +++++++++-------
 include/linux/mount.h                         | 18 ++-------------
 6 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/Documentation/filesystems/propagate_umount.txt b/Documentation/filesystems/propagate_umount.txt
index 6906903a8aa2..c90349e5b889 100644
--- a/Documentation/filesystems/propagate_umount.txt
+++ b/Documentation/filesystems/propagate_umount.txt
@@ -453,11 +453,11 @@ original set.
 So let's go for
 	* original set ("set").  Linkage via mnt_list
 	* undecided candidates ("candidates").  Subset of a list,
-consisting of all its elements marked with a new flag (MNT_UMOUNT_CANDIDATE).
+consisting of all its elements marked with a new flag (T_UMOUNT_CANDIDATE).
 Initially all elements of the list will be marked that way; in the
 end the list will become empty and no mounts will remain marked with
 that flag.
-	* Reuse MNT_MARKED for "has been already seen by trim_ancestors()".
+	* Reuse T_MARKED for "has been already seen by trim_ancestors()".
 	* anything in U that hadn't been in the original set - elements of
 candidates will gradually be either discarded or moved there.  In other
 words, it's the candidates we have already decided to unmount.	Its role
@@ -465,13 +465,13 @@ is reasonably close to the old "to_umount", so let's use that name.
 Linkage via mnt_list.
 
 For gather_candidates() we'll need to maintain both candidates (S -
-set) and intersection of S with set.  Use MNT_UMOUNT_CANDIDATE for
+set) and intersection of S with set.  Use T_UMOUNT_CANDIDATE for
 all elements we encounter, putting the ones not already in the original
 set into the list of candidates.  When we are done, strip that flag from
 all elements of the original set.  That gives a cheap way to check
 if element belongs to S (in gather_candidates) and to candidates
 itself (at later stages).  Call that predicate is_candidate(); it would
-be m->mnt_flags & MNT_UMOUNT_CANDIDATE.
+be m->mnt_t_flags & T_UMOUNT_CANDIDATE.
 
 All elements of the original set are marked with MNT_UMOUNT and we'll
 need the same for elements added when joining the contents of to_umount
@@ -480,5 +480,5 @@ to to_umount; that's close to what the old 'umount_one' is doing, so
 let's keep that name.  It also gives us another predicate we need -
 "belongs to union of set and to_umount"; will_be_unmounted() for now.
 
-Removals from the candidates list should strip both MNT_MARKED and
-MNT_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
+Removals from the candidates list should strip both T_MARKED and
+T_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
diff --git a/fs/mount.h b/fs/mount.h
index 4355c482a841..f299dc85446d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -84,6 +84,7 @@ struct mount {
 	struct list_head to_notify;	/* need to queue notification */
 	struct mnt_namespace *prev_ns;	/* previous namespace (NULL if none) */
 #endif
+	int mnt_t_flags;		/* namespace_sem-protected flags */
 	int mnt_id;			/* mount identifier, reused */
 	u64 mnt_id_unique;		/* mount ID unique until reboot */
 	int mnt_group_id;		/* peer group identifier */
@@ -93,6 +94,22 @@ struct mount {
 	struct mount *overmount;	/* mounted on ->mnt_root */
 } __randomize_layout;
 
+enum {
+	T_SHARED		= 1, /* mount is shared */
+	T_UNBINDABLE		= 2, /* mount is unbindable */
+	T_MARKED		= 4, /* internal mark for propagate_... */
+	T_UMOUNT_CANDIDATE	= 8, /* for propagate_umount */
+
+	/*
+	 * T_SHARED_MASK is the set of flags that should be cleared when a
+	 * mount becomes shared.  Currently, this is only the flag that says a
+	 * mount cannot be bind mounted, since this is how we create a mount
+	 * that shares events with another mount.  If you add a new T_*
+	 * flag, consider how it interacts with shared mounts.
+	 */
+	T_SHARED_MASK	= T_UNBINDABLE,
+};
+
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
 
 static inline struct mount *real_mount(struct vfsmount *mnt)
diff --git a/fs/namespace.c b/fs/namespace.c
index 4bdf6a6e75ca..da27365418a5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2917,10 +2917,8 @@ static int do_change_type(struct path *path, int ms_flags)
 			goto out_unlock;
 	}
 
-	lock_mount_hash();
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	unlock_mount_hash();
 
  out_unlock:
 	namespace_unlock();
@@ -3409,9 +3407,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 	if (IS_MNT_SHARED(from)) {
 		to->mnt_group_id = from->mnt_group_id;
 		list_add(&to->mnt_share, &from->mnt_share);
-		lock_mount_hash();
 		set_mnt_shared(to);
-		unlock_mount_hash();
 	}
 
 	err = 0;
diff --git a/fs/pnode.c b/fs/pnode.c
index 827d71736ac5..b997663de6d0 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -112,7 +112,7 @@ static int do_make_slave(struct mount *mnt)
 }
 
 /*
- * vfsmount lock must be held for write
+ * EXCL[namespace_sem]
  */
 void change_mnt_propagation(struct mount *mnt, int type)
 {
@@ -125,9 +125,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		list_del_init(&mnt->mnt_slave);
 		mnt->mnt_master = NULL;
 		if (type == MS_UNBINDABLE)
-			mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
+			mnt->mnt_t_flags |= T_UNBINDABLE;
 		else
-			mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
+			mnt->mnt_t_flags &= ~T_UNBINDABLE;
 	}
 }
 
@@ -263,9 +263,9 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 		return PTR_ERR(child);
 	read_seqlock_excl(&mount_lock);
 	mnt_set_mountpoint(m, dest_mp, child);
+	read_sequnlock_excl(&mount_lock);
 	if (m->mnt_master != dest_master)
 		SET_MNT_MARK(m->mnt_master);
-	read_sequnlock_excl(&mount_lock);
 	last_dest = m;
 	last_source = child;
 	hlist_add_head(&child->mnt_hash, list);
@@ -322,13 +322,11 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		} while (n != m);
 	}
 out:
-	read_seqlock_excl(&mount_lock);
 	hlist_for_each_entry(n, tree_list, mnt_hash) {
 		m = n->mnt_parent;
 		if (m->mnt_master != dest_mnt->mnt_master)
 			CLEAR_MNT_MARK(m->mnt_master);
 	}
-	read_sequnlock_excl(&mount_lock);
 	return ret;
 }
 
@@ -447,7 +445,7 @@ void propagate_mount_unlock(struct mount *mnt)
 
 static inline bool is_candidate(struct mount *m)
 {
-	return m->mnt.mnt_flags & MNT_UMOUNT_CANDIDATE;
+	return m->mnt_t_flags & T_UMOUNT_CANDIDATE;
 }
 
 static inline bool will_be_unmounted(struct mount *m)
@@ -464,7 +462,7 @@ static void umount_one(struct mount *m, struct list_head *to_umount)
 
 static void remove_from_candidate_list(struct mount *m)
 {
-	m->mnt.mnt_flags &= ~(MNT_MARKED | MNT_UMOUNT_CANDIDATE);
+	m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE);
 	list_del_init(&m->mnt_list);
 }
 
@@ -476,7 +474,7 @@ static void gather_candidates(struct list_head *set,
 	list_for_each_entry(m, set, mnt_list) {
 		if (is_candidate(m))
 			continue;
-		m->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+		m->mnt_t_flags |= T_UMOUNT_CANDIDATE;
 		p = m->mnt_parent;
 		q = propagation_next(p, p);
 		while (q) {
@@ -494,7 +492,7 @@ static void gather_candidates(struct list_head *set,
 					q = skip_propagation_subtree(q, p);
 					continue;
 				}
-				child->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+				child->mnt_t_flags |= T_UMOUNT_CANDIDATE;
 				if (!will_be_unmounted(child))
 					list_add(&child->mnt_list, candidates);
 			}
@@ -502,7 +500,7 @@ static void gather_candidates(struct list_head *set,
 		}
 	}
 	list_for_each_entry(m, set, mnt_list)
-		m->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
+		m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
 }
 
 /*
@@ -519,7 +517,7 @@ static void trim_ancestors(struct mount *m)
 			return;
 		SET_MNT_MARK(m);
 		if (m != p->overmount)
-			p->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
+			p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
 	}
 }
 
diff --git a/fs/pnode.h b/fs/pnode.h
index 04f1ac53aa49..507e30e7a420 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -10,14 +10,14 @@
 #include <linux/list.h>
 #include "mount.h"
 
-#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
+#define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED)
 #define IS_MNT_SLAVE(m) ((m)->mnt_master)
 #define IS_MNT_NEW(m) (!(m)->mnt_ns)
-#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
-#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
-#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
-#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
-#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED)
+#define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED)
 #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
 
 #define CL_EXPIRE    		0x01
@@ -28,10 +28,13 @@
 #define CL_SHARED_TO_SLAVE	0x20
 #define CL_COPY_MNT_NS_FILE	0x40
 
+/*
+ * EXCL[namespace_sem]
+ */
 static inline void set_mnt_shared(struct mount *mnt)
 {
-	mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK;
-	mnt->mnt.mnt_flags |= MNT_SHARED;
+	mnt->mnt_t_flags &= ~T_SHARED_MASK;
+	mnt->mnt_t_flags |= T_SHARED;
 }
 
 static inline bool peers(const struct mount *m1, const struct mount *m2)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 65fa8442c00a..5f9c053b0897 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -35,12 +35,8 @@ enum mount_flags {
 	MNT_SHRINKABLE	= 0x100,
 	MNT_WRITE_HOLD	= 0x200,
 
-	MNT_SHARED	= 0x1000, /* if the vfsmount is a shared mount */
-	MNT_UNBINDABLE	= 0x2000, /* if the vfsmount is a unbindable mount */
-
 	MNT_INTERNAL	= 0x4000,
 
-	MNT_UMOUNT_CANDIDATE	= 0x020000,
 	MNT_LOCK_ATIME		= 0x040000,
 	MNT_LOCK_NOEXEC		= 0x080000,
 	MNT_LOCK_NOSUID		= 0x100000,
@@ -49,25 +45,15 @@ enum mount_flags {
 	MNT_LOCKED		= 0x800000,
 	MNT_DOOMED		= 0x1000000,
 	MNT_SYNC_UMOUNT		= 0x2000000,
-	MNT_MARKED		= 0x4000000,
 	MNT_UMOUNT		= 0x8000000,
 
-	/*
-	 * MNT_SHARED_MASK is the set of flags that should be cleared when a
-	 * mount becomes shared.  Currently, this is only the flag that says a
-	 * mount cannot be bind mounted, since this is how we create a mount
-	 * that shares events with another mount.  If you add a new MNT_*
-	 * flag, consider how it interacts with shared mounts.
-	 */
-	MNT_SHARED_MASK	= MNT_UNBINDABLE,
 	MNT_USER_SETTABLE_MASK  = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC
 				  | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME
 				  | MNT_READONLY | MNT_NOSYMFOLLOW,
 	MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,
 
-	MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL |
-			     MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED |
-			     MNT_LOCKED | MNT_UMOUNT_CANDIDATE,
+	MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED |
+			     MNT_SYNC_UMOUNT | MNT_LOCKED
 };
 
 struct vfsmount {

From 25776a09d802f4e4d8c0bd72042934223286c2e3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 21 Jun 2025 17:41:40 -0400
Subject: [PATCH 30/48] propagate_one(): get rid of dest_master

propagate_mnt() takes the subtree we are about to attach and creates
its copies, setting the propagation between those.  Each copy is cloned
either from the original or from one of the already created copies.
The tricky part is choosing the right copy to serve as a master when we
are starting a new peer group.

The algorithm for doing that selection puts temporary marks on the masters
of mountpoints that already got a copy created for them; since the initial
peer group might have no master at all, we need to special-case that when
looking for the mark.  Currently we do that by memorizing the master of
original peer group.  It works, but we get yet another piece of data to
pass from propagate_mnt() to propagate_one().

Alternative is to mark the master of original peer group if not NULL,
turning the check into "master is NULL or marked".  Less data to pass
around and memory safety is more obvious that way...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index b997663de6d0..870ebced10aa 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -215,7 +215,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 }
 
 /* all accesses are serialized by namespace_sem */
-static struct mount *last_dest, *first_source, *last_source, *dest_master;
+static struct mount *last_dest, *first_source, *last_source;
 static struct hlist_head *list;
 
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
@@ -239,7 +239,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 		bool done;
 		for (n = m; ; n = p) {
 			p = n->mnt_master;
-			if (p == dest_master || IS_MNT_MARKED(p))
+			if (!p || IS_MNT_MARKED(p))
 				break;
 		}
 		do {
@@ -264,7 +264,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 	read_seqlock_excl(&mount_lock);
 	mnt_set_mountpoint(m, dest_mp, child);
 	read_sequnlock_excl(&mount_lock);
-	if (m->mnt_master != dest_master)
+	if (m->mnt_master)
 		SET_MNT_MARK(m->mnt_master);
 	last_dest = m;
 	last_source = child;
@@ -300,7 +300,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 	first_source = source_mnt;
 	last_source = source_mnt;
 	list = tree_list;
-	dest_master = dest_mnt->mnt_master;
+	if (dest_mnt->mnt_master)
+		SET_MNT_MARK(dest_mnt->mnt_master);
 
 	/* all peers of dest_mnt, except dest_mnt itself */
 	for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
@@ -324,9 +325,11 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 out:
 	hlist_for_each_entry(n, tree_list, mnt_hash) {
 		m = n->mnt_parent;
-		if (m->mnt_master != dest_mnt->mnt_master)
+		if (m->mnt_master)
 			CLEAR_MNT_MARK(m->mnt_master);
 	}
+	if (dest_mnt->mnt_master)
+		CLEAR_MNT_MARK(dest_mnt->mnt_master);
 	return ret;
 }
 

From 2b2a34793dc239b0eaebe9559557d051524a7297 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 22:56:43 -0400
Subject: [PATCH 31/48] propagate_mnt(): handle all peer groups in the same
 loop

the only difference is that for the original group we want to skip
the first element; not worth having the logics twice...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 870ebced10aa..f55295e26217 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -289,7 +289,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
 	struct mount *m, *n;
-	int ret = 0;
+	int err = 0;
 
 	/*
 	 * we don't want to bother passing tons of arguments to
@@ -303,26 +303,23 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 	if (dest_mnt->mnt_master)
 		SET_MNT_MARK(dest_mnt->mnt_master);
 
-	/* all peers of dest_mnt, except dest_mnt itself */
-	for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
-		ret = propagate_one(n, dest_mp);
-		if (ret)
-			goto out;
-	}
-
-	/* all slave groups */
-	for (m = next_group(dest_mnt, dest_mnt); m;
-			m = next_group(m, dest_mnt)) {
-		/* everything in that slave group */
-		n = m;
+	/* iterate over peer groups, depth first */
+	for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) {
+		if (m == dest_mnt) { // have one for dest_mnt itself
+			n = next_peer(m);
+			if (n == m)
+				continue;
+		} else {
+			n = m;
+		}
 		do {
-			ret = propagate_one(n, dest_mp);
-			if (ret)
-				goto out;
+			err = propagate_one(n, dest_mp);
+			if (err)
+				break;
 			n = next_peer(n);
 		} while (n != m);
 	}
-out:
+
 	hlist_for_each_entry(n, tree_list, mnt_hash) {
 		m = n->mnt_parent;
 		if (m->mnt_master)
@@ -330,7 +327,7 @@ out:
 	}
 	if (dest_mnt->mnt_master)
 		CLEAR_MNT_MARK(dest_mnt->mnt_master);
-	return ret;
+	return err;
 }
 
 /*

From 15e710b8bbb5c537c39ccaa23963d01c76946834 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:04:23 -0400
Subject: [PATCH 32/48] propagate_one(): separate the "do we need secondary
 here?" logics

take the checks into separate helper - need_secondary(mount, mountpoint).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index f55295e26217..7c832f98595c 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -218,19 +218,24 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 static struct mount *last_dest, *first_source, *last_source;
 static struct hlist_head *list;
 
+static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
+{
+	/* skip ones added by this propagate_mnt() */
+	if (IS_MNT_NEW(m))
+		return false;
+	/* skip if mountpoint isn't visible in m */
+	if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
+		return false;
+	/* skip if m is in the anon_ns */
+	if (is_anon_ns(m->mnt_ns))
+		return false;
+	return true;
+}
+
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 {
 	struct mount *child;
 	int type;
-	/* skip ones added by this propagate_mnt() */
-	if (IS_MNT_NEW(m))
-		return 0;
-	/* skip if mountpoint isn't visible in m */
-	if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
-		return 0;
-	/* skip if m is in the anon_ns */
-	if (is_anon_ns(m->mnt_ns))
-		return 0;
 
 	if (peers(m, last_dest)) {
 		type = CL_MAKE_SHARED;
@@ -313,11 +318,12 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 			n = m;
 		}
 		do {
+			if (!need_secondary(n, dest_mp))
+				continue;
 			err = propagate_one(n, dest_mp);
 			if (err)
 				break;
-			n = next_peer(n);
-		} while (n != m);
+		} while ((n = next_peer(n)) != m);
 	}
 
 	hlist_for_each_entry(n, tree_list, mnt_hash) {

From e0f9396e244c0dcc29921ebc3254cb25d6eb31cf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:09:47 -0400
Subject: [PATCH 33/48] propagate_one(): separate the "what should be the
 master for this copy" part

When we create the first copy for a peer group, it becomes a slave of
one of the existing copies; take that logics into a separate helper -
find_master(parent, last_copy, original).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 7c832f98595c..94de8aad4da5 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -232,6 +232,31 @@ static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
 	return true;
 }
 
+static struct mount *find_master(struct mount *m,
+				struct mount *last_copy,
+				struct mount *original)
+{
+	struct mount *p;
+
+	// ascend until there's a copy for something with the same master
+	for (;;) {
+		p = m->mnt_master;
+		if (!p || IS_MNT_MARKED(p))
+			break;
+		m = p;
+	}
+	while (!peers(last_copy, original)) {
+		struct mount *parent = last_copy->mnt_parent;
+		if (parent->mnt_master == p) {
+			if (!peers(parent, m))
+				last_copy = last_copy->mnt_master;
+			break;
+		}
+		last_copy = last_copy->mnt_master;
+	}
+	return last_copy;
+}
+
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 {
 	struct mount *child;
@@ -240,23 +265,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 	if (peers(m, last_dest)) {
 		type = CL_MAKE_SHARED;
 	} else {
-		struct mount *n, *p;
-		bool done;
-		for (n = m; ; n = p) {
-			p = n->mnt_master;
-			if (!p || IS_MNT_MARKED(p))
-				break;
-		}
-		do {
-			struct mount *parent = last_source->mnt_parent;
-			if (peers(last_source, first_source))
-				break;
-			done = parent->mnt_master == p;
-			if (done && peers(n, parent))
-				break;
-			last_source = last_source->mnt_master;
-		} while (!done);
-
+		last_source = find_master(m, last_source, first_source);
 		type = CL_SLAVE;
 		/* beginning of peer group among the slaves? */
 		if (IS_MNT_SHARED(m))

From 6a2ce2a74bfeee4b66411177125f3b5749003e7f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:16:52 -0400
Subject: [PATCH 34/48] propagate_one(): fold into the sole caller

mechanical expansion; will be cleaned up on the next step

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 57 ++++++++++++++++++++++++------------------------------
 1 file changed, 25 insertions(+), 32 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 94de8aad4da5..aeaec24f7456 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -257,35 +257,6 @@ static struct mount *find_master(struct mount *m,
 	return last_copy;
 }
 
-static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
-{
-	struct mount *child;
-	int type;
-
-	if (peers(m, last_dest)) {
-		type = CL_MAKE_SHARED;
-	} else {
-		last_source = find_master(m, last_source, first_source);
-		type = CL_SLAVE;
-		/* beginning of peer group among the slaves? */
-		if (IS_MNT_SHARED(m))
-			type |= CL_MAKE_SHARED;
-	}
-		
-	child = copy_tree(last_source, last_source->mnt.mnt_root, type);
-	if (IS_ERR(child))
-		return PTR_ERR(child);
-	read_seqlock_excl(&mount_lock);
-	mnt_set_mountpoint(m, dest_mp, child);
-	read_sequnlock_excl(&mount_lock);
-	if (m->mnt_master)
-		SET_MNT_MARK(m->mnt_master);
-	last_dest = m;
-	last_source = child;
-	hlist_add_head(&child->mnt_hash, list);
-	return count_mounts(m->mnt_ns, child);
-}
-
 /*
  * mount 'source_mnt' under the destination 'dest_mnt' at
  * dentry 'dest_dentry'. And propagate that mount to
@@ -302,8 +273,8 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
-	struct mount *m, *n;
-	int err = 0;
+	struct mount *m, *n, *child;
+	int err = 0, type;
 
 	/*
 	 * we don't want to bother passing tons of arguments to
@@ -329,7 +300,29 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		do {
 			if (!need_secondary(n, dest_mp))
 				continue;
-			err = propagate_one(n, dest_mp);
+			if (peers(n, last_dest)) {
+				type = CL_MAKE_SHARED;
+			} else {
+				last_source = find_master(n, last_source, first_source);
+				type = CL_SLAVE;
+				/* beginning of peer group among the slaves? */
+				if (IS_MNT_SHARED(n))
+					type |= CL_MAKE_SHARED;
+			}
+			child = copy_tree(last_source, last_source->mnt.mnt_root, type);
+			if (IS_ERR(child)) {
+				err = PTR_ERR(child);
+				break;
+			}
+			read_seqlock_excl(&mount_lock);
+			mnt_set_mountpoint(n, dest_mp, child);
+			read_sequnlock_excl(&mount_lock);
+			if (n->mnt_master)
+				SET_MNT_MARK(n->mnt_master);
+			last_dest = n;
+			last_source = child;
+			hlist_add_head(&child->mnt_hash, list);
+			err = count_mounts(n->mnt_ns, child);
 			if (err)
 				break;
 		} while ((n = next_peer(n)) != m);

From bc88530a20b1d5c78288ef5383d10b66d3242c48 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:21:57 -0400
Subject: [PATCH 35/48] fs/pnode.c: get rid of globals

this stuff can be local in propagate_mnt() now (and in some cases
duplicates the existing variables there)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index aeaec24f7456..e01f43820a93 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -214,10 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 	}
 }
 
-/* all accesses are serialized by namespace_sem */
-static struct mount *last_dest, *first_source, *last_source;
-static struct hlist_head *list;
-
 static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
 {
 	/* skip ones added by this propagate_mnt() */
@@ -273,18 +269,11 @@ static struct mount *find_master(struct mount *m,
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
-	struct mount *m, *n, *child;
+	struct mount *m, *n, *copy, *this, *last_dest;
 	int err = 0, type;
 
-	/*
-	 * we don't want to bother passing tons of arguments to
-	 * propagate_one(); everything is serialized by namespace_sem,
-	 * so globals will do just fine.
-	 */
 	last_dest = dest_mnt;
-	first_source = source_mnt;
-	last_source = source_mnt;
-	list = tree_list;
+	copy = source_mnt;
 	if (dest_mnt->mnt_master)
 		SET_MNT_MARK(dest_mnt->mnt_master);
 
@@ -303,26 +292,26 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 			if (peers(n, last_dest)) {
 				type = CL_MAKE_SHARED;
 			} else {
-				last_source = find_master(n, last_source, first_source);
+				copy = find_master(n, copy, source_mnt);
 				type = CL_SLAVE;
 				/* beginning of peer group among the slaves? */
 				if (IS_MNT_SHARED(n))
 					type |= CL_MAKE_SHARED;
 			}
-			child = copy_tree(last_source, last_source->mnt.mnt_root, type);
-			if (IS_ERR(child)) {
-				err = PTR_ERR(child);
+			this = copy_tree(copy, copy->mnt.mnt_root, type);
+			if (IS_ERR(this)) {
+				err = PTR_ERR(this);
 				break;
 			}
 			read_seqlock_excl(&mount_lock);
-			mnt_set_mountpoint(n, dest_mp, child);
+			mnt_set_mountpoint(n, dest_mp, this);
 			read_sequnlock_excl(&mount_lock);
 			if (n->mnt_master)
 				SET_MNT_MARK(n->mnt_master);
 			last_dest = n;
-			last_source = child;
-			hlist_add_head(&child->mnt_hash, list);
-			err = count_mounts(n->mnt_ns, child);
+			copy = this;
+			hlist_add_head(&this->mnt_hash, tree_list);
+			err = count_mounts(n->mnt_ns, this);
 			if (err)
 				break;
 		} while ((n = next_peer(n)) != m);

From 0a10217e5cf82835b63875752b57f01bba0bf5b6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:27:48 -0400
Subject: [PATCH 36/48] propagate_mnt(): get rid of last_dest

Its only use is choosing the type of copy - CL_MAKE_SHARED if there
already is a copy in that peer group, CL_SLAVE or CL_SLAVE | CL_MAKE_SHARED
otherwise.

But that's easy to keep track of - just set type in the beginning of group
and reset to CL_MAKE_SHARED after the first created secondary in it...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index e01f43820a93..b3af55123a82 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -269,35 +269,32 @@ static struct mount *find_master(struct mount *m,
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
-	struct mount *m, *n, *copy, *this, *last_dest;
+	struct mount *m, *n, *copy, *this;
 	int err = 0, type;
 
-	last_dest = dest_mnt;
-	copy = source_mnt;
 	if (dest_mnt->mnt_master)
 		SET_MNT_MARK(dest_mnt->mnt_master);
 
 	/* iterate over peer groups, depth first */
 	for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) {
 		if (m == dest_mnt) { // have one for dest_mnt itself
+			copy = source_mnt;
+			type = CL_MAKE_SHARED;
 			n = next_peer(m);
 			if (n == m)
 				continue;
 		} else {
+			type = CL_SLAVE;
+			/* beginning of peer group among the slaves? */
+			if (IS_MNT_SHARED(m))
+				type |= CL_MAKE_SHARED;
 			n = m;
 		}
 		do {
 			if (!need_secondary(n, dest_mp))
 				continue;
-			if (peers(n, last_dest)) {
-				type = CL_MAKE_SHARED;
-			} else {
+			if (type & CL_SLAVE) // first in this peer group
 				copy = find_master(n, copy, source_mnt);
-				type = CL_SLAVE;
-				/* beginning of peer group among the slaves? */
-				if (IS_MNT_SHARED(n))
-					type |= CL_MAKE_SHARED;
-			}
 			this = copy_tree(copy, copy->mnt.mnt_root, type);
 			if (IS_ERR(this)) {
 				err = PTR_ERR(this);
@@ -308,12 +305,12 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 			read_sequnlock_excl(&mount_lock);
 			if (n->mnt_master)
 				SET_MNT_MARK(n->mnt_master);
-			last_dest = n;
 			copy = this;
 			hlist_add_head(&this->mnt_hash, tree_list);
 			err = count_mounts(n->mnt_ns, this);
 			if (err)
 				break;
+			type = CL_MAKE_SHARED;
 		} while ((n = next_peer(n)) != m);
 	}
 

From 0313356520b15deab893cd62da3e2ba9a6e61a1f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:39:23 -0400
Subject: [PATCH 37/48] propagate_mnt(): fix comment and convert to kernel-doc,
 while we are at it

Mountpoint is passed as struct mountpoint *, not struct dentry *
(and called dest_mp, not dest_dentry) since 2013.

Roots of created copies are linked via mnt_hash, not mnt_list since
a bit before the merge into mainline back in 2005.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index b3af55123a82..b887116f0041 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -253,21 +253,20 @@ static struct mount *find_master(struct mount *m,
 	return last_copy;
 }
 
-/*
- * mount 'source_mnt' under the destination 'dest_mnt' at
- * dentry 'dest_dentry'. And propagate that mount to
- * all the peer and slave mounts of 'dest_mnt'.
- * Link all the new mounts into a propagation tree headed at
- * source_mnt. Also link all the new mounts using ->mnt_list
- * headed at source_mnt's ->mnt_list
+/**
+ * propagate_mnt() - create secondary copies for tree attachment
+ * @dest_mnt:    destination mount.
+ * @dest_mp:     destination mountpoint.
+ * @source_mnt:  source mount.
+ * @tree_list:   list of secondaries to be attached.
  *
- * @dest_mnt: destination mount.
- * @dest_dentry: destination dentry.
- * @source_mnt: source mount.
- * @tree_list : list of heads of trees to be attached.
+ * Create secondary copies for attaching a tree with root @source_mnt
+ * at mount @dest_mnt with mountpoint @dest_mp.  Link all new mounts
+ * into a propagation graph.  Set mountpoints for all secondaries,
+ * link their roots into @tree_list via ->mnt_hash.
  */
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
-		    struct mount *source_mnt, struct hlist_head *tree_list)
+		  struct mount *source_mnt, struct hlist_head *tree_list)
 {
 	struct mount *m, *n, *copy, *this;
 	int err = 0, type;

From d5f15047f13b86b74d1ac4f39036ccae2078c492 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:20:47 -0400
Subject: [PATCH 38/48] change_mnt_propagation() cleanups, step 1

Lift changing ->mnt_slave from do_make_slave() into the caller.
Simplifies the next steps...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index b887116f0041..14618eac2025 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -104,7 +104,6 @@ static int do_make_slave(struct mount *mnt)
 	}
 	list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
 		slave_mnt->mnt_master = master;
-	list_move(&mnt->mnt_slave, &master->mnt_slave_list);
 	list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
 	INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	mnt->mnt_master = master;
@@ -121,8 +120,12 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		return;
 	}
 	do_make_slave(mnt);
-	if (type != MS_SLAVE) {
-		list_del_init(&mnt->mnt_slave);
+	list_del_init(&mnt->mnt_slave);
+	if (type == MS_SLAVE) {
+		if (mnt->mnt_master)
+			list_add(&mnt->mnt_slave,
+				 &mnt->mnt_master->mnt_slave_list);
+	} else {
 		mnt->mnt_master = NULL;
 		if (type == MS_UNBINDABLE)
 			mnt->mnt_t_flags |= T_UNBINDABLE;

From ef86251194de9b31f7efcf70ea480ebe736e9e60 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:25:00 -0400
Subject: [PATCH 39/48] change_mnt_propagation(): do_make_slave() is a no-op
 unless IS_MNT_SHARED()

... since mnt->mnt_share and mnt->mnt_slave_list are guaranteed to be empty unless
IS_MNT_SHARED(mnt).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 14618eac2025..9723f05cda5f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -70,10 +70,8 @@ static int do_make_slave(struct mount *mnt)
 	struct mount *master, *slave_mnt;
 
 	if (list_empty(&mnt->mnt_share)) {
-		if (IS_MNT_SHARED(mnt)) {
-			mnt_release_group_id(mnt);
-			CLEAR_MNT_SHARED(mnt);
-		}
+		mnt_release_group_id(mnt);
+		CLEAR_MNT_SHARED(mnt);
 		master = mnt->mnt_master;
 		if (!master) {
 			struct list_head *p = &mnt->mnt_slave_list;
@@ -119,7 +117,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		set_mnt_shared(mnt);
 		return;
 	}
-	do_make_slave(mnt);
+	if (IS_MNT_SHARED(mnt))
+		do_make_slave(mnt);
 	list_del_init(&mnt->mnt_slave);
 	if (type == MS_SLAVE) {
 		if (mnt->mnt_master)

From 955336e204ab59301ff8b1f75a98a226f5a98782 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 18:12:56 -0400
Subject: [PATCH 40/48] do_make_slave(): choose new master sanely

When mount changes propagation type so that it doesn't propagate
events any more (MS_PRIVATE, MS_SLAVE, MS_UNBINDABLE), we need
to make sure that event propagation between other mounts is
unaffected.

We need to make sure that events from peers and master of that mount
(if any) still reach everything that used to be on its ->mnt_slave_list.

If mount has neither peers nor master, we simply need to dissolve
its ->mnt_slave_list and clear ->mnt_master of everything in there.

If mount has peers, we transfer everything in ->mnt_slave_list of
this mount into that of some of those peers (and adjust ->mnt_master
accordingly).

If mount has a master but no peers, we transfer everything in
->mnt_slave_list of this mount into that of its master (adjusting
->mnt_master, etc.).

There are two problems with the current implementation:
	* there's a long-obsolete logics in choosing the peer -
once upon a time it made sense to prefer the peer that had the
same ->mnt_root as our mount, but that had been pointless since
2014 ("smarter propagate_mnt()")
	* the most common caller of that thing is umount_tree()
taking the mounts out of propagation graph.  In that case it's
possible to have ->mnt_slave_list contents moved many times,
since the replacement master is likely to be taken out by the
same umount_tree(), etc.

Take the choice of replacement master into a separate function
(propagation_source()) and teach it to skip the candidates that
are going to be taken out.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 62 +++++++++++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 9723f05cda5f..91d10af867bd 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -65,40 +65,45 @@ int get_dominating_id(struct mount *mnt, const struct path *root)
 	return 0;
 }
 
+static inline bool will_be_unmounted(struct mount *m)
+{
+	return m->mnt.mnt_flags & MNT_UMOUNT;
+}
+
+static struct mount *propagation_source(struct mount *mnt)
+{
+	do {
+		struct mount *m;
+		for (m = next_peer(mnt); m != mnt; m = next_peer(m)) {
+			if (!will_be_unmounted(m))
+				return m;
+		}
+		mnt = mnt->mnt_master;
+	} while (mnt && will_be_unmounted(mnt));
+	return mnt;
+}
+
 static int do_make_slave(struct mount *mnt)
 {
-	struct mount *master, *slave_mnt;
+	struct mount *master = propagation_source(mnt);
+	struct mount *slave_mnt;
 
 	if (list_empty(&mnt->mnt_share)) {
 		mnt_release_group_id(mnt);
-		CLEAR_MNT_SHARED(mnt);
-		master = mnt->mnt_master;
-		if (!master) {
-			struct list_head *p = &mnt->mnt_slave_list;
-			while (!list_empty(p)) {
-				slave_mnt = list_first_entry(p,
-						struct mount, mnt_slave);
-				list_del_init(&slave_mnt->mnt_slave);
-				slave_mnt->mnt_master = NULL;
-			}
-			return 0;
-		}
 	} else {
-		struct mount *m;
-		/*
-		 * slave 'mnt' to a peer mount that has the
-		 * same root dentry. If none is available then
-		 * slave it to anything that is available.
-		 */
-		for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) {
-			if (m->mnt.mnt_root == mnt->mnt.mnt_root) {
-				master = m;
-				break;
-			}
-		}
 		list_del_init(&mnt->mnt_share);
 		mnt->mnt_group_id = 0;
-		CLEAR_MNT_SHARED(mnt);
+	}
+	CLEAR_MNT_SHARED(mnt);
+	if (!master) {
+		struct list_head *p = &mnt->mnt_slave_list;
+		while (!list_empty(p)) {
+			slave_mnt = list_first_entry(p,
+					struct mount, mnt_slave);
+			list_del_init(&slave_mnt->mnt_slave);
+			slave_mnt->mnt_master = NULL;
+		}
+		return 0;
 	}
 	list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
 		slave_mnt->mnt_master = master;
@@ -443,11 +448,6 @@ static inline bool is_candidate(struct mount *m)
 	return m->mnt_t_flags & T_UMOUNT_CANDIDATE;
 }
 
-static inline bool will_be_unmounted(struct mount *m)
-{
-	return m->mnt.mnt_flags & MNT_UMOUNT;
-}
-
 static void umount_one(struct mount *m, struct list_head *to_umount)
 {
 	m->mnt.mnt_flags |= MNT_UMOUNT;

From 94a8d0027606397ce58b00077bf6146f25923965 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:36:43 -0400
Subject: [PATCH 41/48] turn do_make_slave() into transfer_propagation()

Lift calculation of replacement propagation source, removal from
peer group and assignment of ->mnt_master from do_make_slave() into
change_mnt_propagation() itself.  What remains is switching of
what used to get propagation *through* mnt to alternative source.
Rename to transfer_propagation(), passing it the replacement source
as the second argument.  Have it return void, while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 91d10af867bd..0a54848cbbd1 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -83,19 +83,10 @@ static struct mount *propagation_source(struct mount *mnt)
 	return mnt;
 }
 
-static int do_make_slave(struct mount *mnt)
+static void transfer_propagation(struct mount *mnt, struct mount *to)
 {
-	struct mount *master = propagation_source(mnt);
 	struct mount *slave_mnt;
-
-	if (list_empty(&mnt->mnt_share)) {
-		mnt_release_group_id(mnt);
-	} else {
-		list_del_init(&mnt->mnt_share);
-		mnt->mnt_group_id = 0;
-	}
-	CLEAR_MNT_SHARED(mnt);
-	if (!master) {
+	if (!to) {
 		struct list_head *p = &mnt->mnt_slave_list;
 		while (!list_empty(p)) {
 			slave_mnt = list_first_entry(p,
@@ -103,14 +94,12 @@ static int do_make_slave(struct mount *mnt)
 			list_del_init(&slave_mnt->mnt_slave);
 			slave_mnt->mnt_master = NULL;
 		}
-		return 0;
+		return;
 	}
 	list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
-		slave_mnt->mnt_master = master;
-	list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
+		slave_mnt->mnt_master = to;
+	list_splice(&mnt->mnt_slave_list, to->mnt_slave_list.prev);
 	INIT_LIST_HEAD(&mnt->mnt_slave_list);
-	mnt->mnt_master = master;
-	return 0;
 }
 
 /*
@@ -122,8 +111,19 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		set_mnt_shared(mnt);
 		return;
 	}
-	if (IS_MNT_SHARED(mnt))
-		do_make_slave(mnt);
+	if (IS_MNT_SHARED(mnt)) {
+		struct mount *m = propagation_source(mnt);
+
+		if (list_empty(&mnt->mnt_share)) {
+			mnt_release_group_id(mnt);
+		} else {
+			list_del_init(&mnt->mnt_share);
+			mnt->mnt_group_id = 0;
+		}
+		CLEAR_MNT_SHARED(mnt);
+		transfer_propagation(mnt, m);
+		mnt->mnt_master = m;
+	}
 	list_del_init(&mnt->mnt_slave);
 	if (type == MS_SLAVE) {
 		if (mnt->mnt_master)

From 8c5a853f58c5b86b033842b78a0ad3d1208672fa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:51:31 -0400
Subject: [PATCH 42/48] mnt_slave_list/mnt_slave: turn into
 hlist_head/hlist_node

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |  4 ++--
 fs/namespace.c | 14 ++++++--------
 fs/pnode.c     | 41 +++++++++++++++++++----------------------
 3 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index f299dc85446d..08583428b10b 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -69,8 +69,8 @@ struct mount {
 	struct list_head mnt_list;
 	struct list_head mnt_expire;	/* link in fs-specific expiry list */
 	struct list_head mnt_share;	/* circular list of shared mounts */
-	struct list_head mnt_slave_list;/* list of slave mounts */
-	struct list_head mnt_slave;	/* slave list entry */
+	struct hlist_head mnt_slave_list;/* list of slave mounts */
+	struct hlist_node mnt_slave;	/* slave list entry */
 	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
diff --git a/fs/namespace.c b/fs/namespace.c
index da27365418a5..38a46b32413d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -380,8 +380,8 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_list);
 		INIT_LIST_HEAD(&mnt->mnt_expire);
 		INIT_LIST_HEAD(&mnt->mnt_share);
-		INIT_LIST_HEAD(&mnt->mnt_slave_list);
-		INIT_LIST_HEAD(&mnt->mnt_slave);
+		INIT_HLIST_HEAD(&mnt->mnt_slave_list);
+		INIT_HLIST_NODE(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 		RB_CLEAR_NODE(&mnt->mnt_node);
@@ -1348,10 +1348,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 
 	if ((flag & CL_SLAVE) ||
 	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
-		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+		hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list);
 		mnt->mnt_master = old;
 	} else if (IS_MNT_SLAVE(old)) {
-		list_add(&mnt->mnt_slave, &old->mnt_slave);
+		hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave);
 		mnt->mnt_master = old->mnt_master;
 	}
 	return mnt;
@@ -3398,10 +3398,8 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 		goto out;
 
 	if (IS_MNT_SLAVE(from)) {
-		struct mount *m = from->mnt_master;
-
-		list_add(&to->mnt_slave, &from->mnt_slave);
-		to->mnt_master = m;
+		hlist_add_behind(&to->mnt_slave, &from->mnt_slave);
+		to->mnt_master = from->mnt_master;
 	}
 
 	if (IS_MNT_SHARED(from)) {
diff --git a/fs/pnode.c b/fs/pnode.c
index 0a54848cbbd1..69278079faeb 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -21,12 +21,12 @@ static inline struct mount *next_peer(struct mount *p)
 
 static inline struct mount *first_slave(struct mount *p)
 {
-	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
+	return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave);
 }
 
 static inline struct mount *next_slave(struct mount *p)
 {
-	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
+	return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
 }
 
 static struct mount *get_peer_under_root(struct mount *mnt,
@@ -85,21 +85,18 @@ static struct mount *propagation_source(struct mount *mnt)
 
 static void transfer_propagation(struct mount *mnt, struct mount *to)
 {
-	struct mount *slave_mnt;
-	if (!to) {
-		struct list_head *p = &mnt->mnt_slave_list;
-		while (!list_empty(p)) {
-			slave_mnt = list_first_entry(p,
-					struct mount, mnt_slave);
-			list_del_init(&slave_mnt->mnt_slave);
-			slave_mnt->mnt_master = NULL;
-		}
-		return;
+	struct hlist_node *p = NULL, *n;
+	struct mount *m;
+
+	hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) {
+		m->mnt_master = to;
+		if (!to)
+			hlist_del_init(&m->mnt_slave);
+		else
+			p = &m->mnt_slave;
 	}
-	list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
-		slave_mnt->mnt_master = to;
-	list_splice(&mnt->mnt_slave_list, to->mnt_slave_list.prev);
-	INIT_LIST_HEAD(&mnt->mnt_slave_list);
+	if (p)
+		hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list);
 }
 
 /*
@@ -124,10 +121,10 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		transfer_propagation(mnt, m);
 		mnt->mnt_master = m;
 	}
-	list_del_init(&mnt->mnt_slave);
+	hlist_del_init(&mnt->mnt_slave);
 	if (type == MS_SLAVE) {
 		if (mnt->mnt_master)
-			list_add(&mnt->mnt_slave,
+			hlist_add_head(&mnt->mnt_slave,
 				 &mnt->mnt_master->mnt_slave_list);
 	} else {
 		mnt->mnt_master = NULL;
@@ -147,7 +144,7 @@ static struct mount *__propagation_next(struct mount *m,
 		if (master == origin->mnt_master) {
 			struct mount *next = next_peer(m);
 			return (next == origin) ? NULL : next;
-		} else if (m->mnt_slave.next != &master->mnt_slave_list)
+		} else if (m->mnt_slave.next)
 			return next_slave(m);
 
 		/* back at master */
@@ -169,7 +166,7 @@ static struct mount *propagation_next(struct mount *m,
 					 struct mount *origin)
 {
 	/* are there any slaves of this mount? */
-	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+	if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
 		return first_slave(m);
 
 	return __propagation_next(m, origin);
@@ -194,7 +191,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 	while (1) {
 		while (1) {
 			struct mount *next;
-			if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+			if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
 				return first_slave(m);
 			next = next_peer(m);
 			if (m->mnt_group_id == origin->mnt_group_id) {
@@ -207,7 +204,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 		/* m is the last peer */
 		while (1) {
 			struct mount *master = m->mnt_master;
-			if (m->mnt_slave.next != &master->mnt_slave_list)
+			if (m->mnt_slave.next)
 				return next_slave(m);
 			m = next_peer(master);
 			if (master->mnt_group_id == origin->mnt_group_id)

From dd5a4e1d640bf3542c4583491e6b91d25de3b760 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:54:41 -0400
Subject: [PATCH 43/48] change_mnt_propagation(): move ->mnt_master assignment
 into MS_SLAVE case

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 69278079faeb..cbf5f5746252 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -104,13 +104,14 @@ static void transfer_propagation(struct mount *mnt, struct mount *to)
  */
 void change_mnt_propagation(struct mount *mnt, int type)
 {
+	struct mount *m = mnt->mnt_master;
+
 	if (type == MS_SHARED) {
 		set_mnt_shared(mnt);
 		return;
 	}
 	if (IS_MNT_SHARED(mnt)) {
-		struct mount *m = propagation_source(mnt);
-
+		m = propagation_source(mnt);
 		if (list_empty(&mnt->mnt_share)) {
 			mnt_release_group_id(mnt);
 		} else {
@@ -119,13 +120,12 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		}
 		CLEAR_MNT_SHARED(mnt);
 		transfer_propagation(mnt, m);
-		mnt->mnt_master = m;
 	}
 	hlist_del_init(&mnt->mnt_slave);
 	if (type == MS_SLAVE) {
-		if (mnt->mnt_master)
-			hlist_add_head(&mnt->mnt_slave,
-				 &mnt->mnt_master->mnt_slave_list);
+		mnt->mnt_master = m;
+		if (m)
+			hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list);
 	} else {
 		mnt->mnt_master = NULL;
 		if (type == MS_UNBINDABLE)

From 663206854f020ec6fc6bfd3d52f501a28ede1403 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 17 Jun 2025 21:35:22 -0400
Subject: [PATCH 44/48] copy_tree(): don't link the mounts via mnt_list

The only place that really needs to be adjusted is commit_tree() -
there we need to iterate through the copy and we might as well
use next_mnt() for that.  However, in case when our tree has been
slid under something already mounted (propagation to a mountpoint
that already has something mounted on it or a 'beneath' move_mount)
we need to take care not to walk into the overmounting tree.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |  3 +--
 fs/namespace.c | 60 ++++++++++++++++++++------------------------------
 fs/pnode.c     |  3 ++-
 3 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 08583428b10b..97737051a8b9 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -193,7 +193,7 @@ static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
 	return RB_EMPTY_ROOT(&ns->mounts);
 }
 
-static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+static inline void move_from_ns(struct mount *mnt)
 {
 	struct mnt_namespace *ns = mnt->mnt_ns;
 	WARN_ON(!mnt_ns_attached(mnt));
@@ -203,7 +203,6 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
 		ns->mnt_first_node = rb_next(&mnt->mnt_node);
 	rb_erase(&mnt->mnt_node, &ns->mounts);
 	RB_CLEAR_NODE(&mnt->mnt_node);
-	list_add_tail(&mnt->mnt_list, dt_list);
 }
 
 bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index 38a46b32413d..bd6c7da901fc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1161,34 +1161,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
 	mnt_notify_add(mnt);
 }
 
-/*
- * vfsmount lock must be held for write
- */
-static void commit_tree(struct mount *mnt)
-{
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
-	LIST_HEAD(head);
-	struct mnt_namespace *n = parent->mnt_ns;
-
-	BUG_ON(parent == mnt);
-
-	if (!mnt_ns_attached(mnt)) {
-		list_add_tail(&head, &mnt->mnt_list);
-		while (!list_empty(&head)) {
-			m = list_first_entry(&head, typeof(*m), mnt_list);
-			list_del(&m->mnt_list);
-
-			mnt_add_to_ns(n, m);
-		}
-		n->nr_mounts += n->pending_mounts;
-		n->pending_mounts = 0;
-	}
-
-	make_visible(mnt);
-	touch_mnt_namespace(n);
-}
-
 static struct mount *next_mnt(struct mount *p, struct mount *root)
 {
 	struct list_head *next = p->mnt_mounts.next;
@@ -1215,6 +1187,27 @@ static struct mount *skip_mnt_tree(struct mount *p)
 	return p;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
+static void commit_tree(struct mount *mnt)
+{
+	struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;
+
+	if (!mnt_ns_attached(mnt)) {
+		for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
+			if (unlikely(mnt_ns_attached(m)))
+				m = skip_mnt_tree(m);
+			else
+				mnt_add_to_ns(n, m);
+		n->nr_mounts += n->pending_mounts;
+		n->pending_mounts = 0;
+	}
+
+	make_visible(mnt);
+	touch_mnt_namespace(n);
+}
+
 /**
  * vfs_create_mount - Create a mount for a configured superblock
  * @fc: The configuration context with the superblock attached
@@ -1831,9 +1824,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		p->mnt.mnt_flags |= MNT_UMOUNT;
 		if (mnt_ns_attached(p))
-			move_from_ns(p, &tmp_list);
-		else
-			list_move(&p->mnt_list, &tmp_list);
+			move_from_ns(p);
+		list_add_tail(&p->mnt_list, &tmp_list);
 	}
 
 	/* Hide the mounts from mnt_mounts */
@@ -2270,7 +2262,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
 					list_add(&dst_mnt->mnt_expire,
 						 &src_mnt->mnt_expire);
 			}
-			list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
 			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
 			unlock_mount_hash();
 		}
@@ -2686,12 +2677,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		list_del_init(&source_mnt->mnt_expire);
 	} else {
 		if (source_mnt->mnt_ns) {
-			LIST_HEAD(head);
-
 			/* move from anon - the caller will destroy */
 			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
-				move_from_ns(p, &head);
-			list_del_init(&head);
+				move_from_ns(p);
 		}
 	}
 
diff --git a/fs/pnode.c b/fs/pnode.c
index cbf5f5746252..81f7599bdac4 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -449,7 +449,8 @@ static void umount_one(struct mount *m, struct list_head *to_umount)
 {
 	m->mnt.mnt_flags |= MNT_UMOUNT;
 	list_del_init(&m->mnt_child);
-	move_from_ns(m, to_umount);
+	move_from_ns(m);
+	list_add_tail(&m->mnt_list, to_umount);
 }
 
 static void remove_from_candidate_list(struct mount *m)

From aab771f34e63ef89e195b63d121abcb55eebfde6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 18 Jun 2025 18:23:41 -0400
Subject: [PATCH 45/48] take freeing of emptied mnt_namespace to
 namespace_unlock()

Freeing of a namespace must be delayed until after we'd dealt with mount
notifications (in namespace_unlock()).  The reasons are not immediately
obvious (they are buried in ->prev_ns handling in mnt_notify()), and
having that free_mnt_ns() explicitly called after namespace_unlock()
is asking for trouble - it does feel like they should be OK to free
as soon as they've been emptied.

Make the things more explicit by setting 'emptied_ns' under namespace_sem
and having namespace_unlock() free the sucker as soon as it's safe to free.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index bd6c7da901fc..85db0de5fb53 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -79,6 +79,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);
 
 #ifdef CONFIG_FSNOTIFY
@@ -1730,15 +1731,18 @@ static bool need_notify_mnt_list(void)
 }
 #endif
 
+static void free_mnt_ns(struct mnt_namespace *);
 static void namespace_unlock(void)
 {
 	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+	struct mnt_namespace *ns = emptied_ns;
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
 	list_splice_init(&ex_mountpoints, &list);
+	emptied_ns = NULL;
 
 	if (need_notify_mnt_list()) {
 		/*
@@ -1752,6 +1756,11 @@ static void namespace_unlock(void)
 	} else {
 		up_write(&namespace_sem);
 	}
+	if (unlikely(ns)) {
+		/* Make sure we notice when we leak mounts. */
+		VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
+		free_mnt_ns(ns);
+	}
 
 	shrink_dentry_list(&list);
 
@@ -2335,12 +2344,10 @@ void drop_collected_paths(struct path *paths, struct path *prealloc)
 		kfree(paths);
 }
 
-static void free_mnt_ns(struct mnt_namespace *);
 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
 
 void dissolve_on_fput(struct vfsmount *mnt)
 {
-	struct mnt_namespace *ns;
 	struct mount *m = real_mount(mnt);
 
 	/*
@@ -2362,15 +2369,11 @@ void dissolve_on_fput(struct vfsmount *mnt)
 		if (!anon_ns_root(m))
 			return;
 
-		ns = m->mnt_ns;
+		emptied_ns = m->mnt_ns;
 		lock_mount_hash();
 		umount_tree(m, UMOUNT_CONNECTED);
 		unlock_mount_hash();
 	}
-
-	/* Make sure we notice when we leak mounts. */
-	VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
-	free_mnt_ns(ns);
 }
 
 static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
@@ -2678,6 +2681,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	} else {
 		if (source_mnt->mnt_ns) {
 			/* move from anon - the caller will destroy */
+			emptied_ns = source_mnt->mnt_ns;
 			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
 				move_from_ns(p);
 		}
@@ -3656,13 +3660,6 @@ static int do_move_mount(struct path *old_path,
 	err = attach_recursive_mnt(old, p, mp.mp);
 out:
 	unlock_mount(&mp);
-	if (!err) {
-		if (is_anon_ns(ns)) {
-			/* Make sure we notice when we leak mounts. */
-			VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
-			free_mnt_ns(ns);
-		}
-	}
 	return err;
 }
 
@@ -6153,11 +6150,11 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	if (!refcount_dec_and_test(&ns->ns.count))
 		return;
 	namespace_lock();
+	emptied_ns = ns;
 	lock_mount_hash();
 	umount_tree(ns->root, 0);
 	unlock_mount_hash();
 	namespace_unlock();
-	free_mnt_ns(ns);
 }
 
 struct vfsmount *kern_mount(struct file_system_type *type)

From 725ab435ff6e31faca26b8234f9f04c19f772b18 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 25 Jun 2025 14:48:50 -0400
Subject: [PATCH 46/48] get rid of CL_SHARE_TO_SLAVE

the only difference between it and CL_SLAVE is in this predicate
in clone_mnt():
	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
However, in case of CL_SHARED_TO_SLAVE we have not allocated any
mount group ids since the time we'd grabbed namespace_sem, so
IS_MNT_SHARED() is equivalent to non-zero ->mnt_group_id.  And
in case of CL_SLAVE old has come either from the original tree,
which had ->mnt_group_id allocated for all nodes or from result
of sequence of CL_MAKE_SHARED or CL_MAKE_SHARED|CL_SLAVE copies,
ultimately going back to the original tree.  In both cases we are
guaranteed that old->mnt_group_id will be non-zero.

In other words, the predicate is always equal to
	(flags & (CL_SLAVE | CL_SHARED_TO_SLAVE)) && old->mnt_group_id
and with that replacement CL_SLAVE and CL_SHARED_TO_SLAVE have exact
same behaviour.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 7 +++----
 fs/pnode.h     | 1 -
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 85db0de5fb53..ca36c4a6a143 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1309,7 +1309,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
 			     ~MNT_INTERNAL_FLAGS;
 
-	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
+	if (flag & (CL_SLAVE | CL_PRIVATE))
 		mnt->mnt_group_id = 0; /* not a peer of original */
 	else
 		mnt->mnt_group_id = old->mnt_group_id;
@@ -1340,8 +1340,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	if (peers(mnt, old))
 		list_add(&mnt->mnt_share, &old->mnt_share);
 
-	if ((flag & CL_SLAVE) ||
-	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
+	if ((flag & CL_SLAVE) && old->mnt_group_id) {
 		hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list);
 		mnt->mnt_master = old;
 	} else if (IS_MNT_SLAVE(old)) {
@@ -4228,7 +4227,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	/* First pass: copy the tree topology */
 	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
 	if (user_ns != ns->user_ns)
-		copy_flags |= CL_SHARED_TO_SLAVE;
+		copy_flags |= CL_SLAVE;
 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
 		namespace_unlock();
diff --git a/fs/pnode.h b/fs/pnode.h
index 507e30e7a420..00ab153e3e9d 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -25,7 +25,6 @@
 #define CL_COPY_UNBINDABLE	0x04
 #define CL_MAKE_SHARED 		0x08
 #define CL_PRIVATE 		0x10
-#define CL_SHARED_TO_SLAVE	0x20
 #define CL_COPY_MNT_NS_FILE	0x40
 
 /*

From f6cc2f4e3d304c93b44c80f50430aa40e080cc3c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 25 Jun 2025 15:02:11 -0400
Subject: [PATCH 47/48] invent_group_ids(): zero ->mnt_group_id always implies
 !IS_MNT_SHARED()

All places where we call set_mnt_shared() are guaranteed to have
non-zero ->mnt_group_id - either by explicit test, or by having
done successful invent_group_ids() covering the same mount since
we'd grabbed namespace_sem.

The opposite combination (non-zero ->mnt_group_id and !IS_MNT_SHARED())
*is* possible - it means that we have allocated group id, but didn't
get around to set_mnt_shared() yet; such state is transient -
by the time we do namespace_unlock(), we must either do set_mnt_shared()
or unroll the group id allocations by cleanup_group_ids().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index ca36c4a6a143..a75438121417 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2516,7 +2516,7 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
 	struct mount *p;
 
 	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
-		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
+		if (!p->mnt_group_id) {
 			int err = mnt_alloc_group_id(p);
 			if (err) {
 				cleanup_group_ids(mnt, p);

From a7cce099450f8fc597a6ac215440666610895fb7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 25 Jun 2025 15:10:52 -0400
Subject: [PATCH 48/48] statmount_mnt_basic(): simplify the logics for group id

We are holding namespace_sem shared and we have not done any group
id allocations since we grabbed it.  Therefore IS_MNT_SHARED(m)
is equivalent to non-zero m->mnt_group_id.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index a75438121417..c549bd39c210 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5298,7 +5298,7 @@ static void statmount_mnt_basic(struct kstatmount *s)
 	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
 	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
 	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
-	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+	s->sm.mnt_peer_group = m->mnt_group_id;
 	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
 }