git: 0de03c306c2f - main - man9: Add an smr(9) manual page

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Mon, 20 Feb 2023 14:17:54 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=0de03c306c2f95580e56c65b74ebf8c04c1416b3

commit 0de03c306c2f95580e56c65b74ebf8c04c1416b3
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2023-02-20 13:58:19 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2023-02-20 13:58:19 +0000

    man9: Add an smr(9) manual page
    
    Also update the UMA manual page to mention its SMR-enabled
    functionality, and update locking.9 to mention both epoch and SMR.
    Details of its usage are provided in the SMR manual page.
    
    Reviewed by:    Olivier Certner, mhorne, kib
    MFC after:      2 weeks
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D38108
---
 share/man/man9/Makefile  |   9 ++
 share/man/man9/locking.9 |  37 ++++--
 share/man/man9/smr.9     | 294 +++++++++++++++++++++++++++++++++++++++++++++++
 share/man/man9/zone.9    |  42 ++++++-
 4 files changed, 373 insertions(+), 9 deletions(-)

diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
index b2c141210ab9..ab6f146fa2a9 100644
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -319,6 +319,7 @@ MAN=	accept_filter.9 \
 	signal.9 \
 	sleep.9 \
 	sleepqueue.9 \
+	smr.9 \
 	socket.9 \
 	stack.9 \
 	store.9 \
@@ -2051,6 +2052,14 @@ MLINKS+=sleepqueue.9 init_sleepqueues.9 \
 	sleepqueue.9 sleepq_type.9 \
 	sleepqueue.9 sleepq_wait.9 \
 	sleepqueue.9 sleepq_wait_sig.9
+MLINKS+=smr.9 smr_advance.9 \
+	smr.9 smr_create.9 \
+	smr.9 smr_destroy.9 \
+	smr.9 smr_enter.9 \
+	smr.9 smr_exit.9 \
+	smr.9 smr_poll.9 \
+	smr.9 smr_synchronize.9 \
+	smr.9 smr_wait.9
 MLINKS+=socket.9 soabort.9 \
 	socket.9 soaccept.9 \
 	socket.9 sobind.9 \
diff --git a/share/man/man9/locking.9 b/share/man/man9/locking.9
index 145ae6e9f1f5..e1e4ccd33664 100644
--- a/share/man/man9/locking.9
+++ b/share/man/man9/locking.9
@@ -24,7 +24,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd March 29, 2022
+.Dd February 3, 2023
 .Dt LOCKING 9
 .Os
 .Sh NAME
@@ -145,6 +145,32 @@ for this reason, they should be avoided.
 See
 .Xr lock 9
 for details.
+.Ss Non-blocking synchronization
+The kernel has two facilities,
+.Xr epoch 9
+and
+.Xr smr 9 ,
+which can be used to provide read-only access to a data structure while one or
+more writers are concurrently modifying the data structure.
+Specifically, readers using
+.Xr epoch 9
+and
+.Xr smr 9
+to synchronize accesses do not block writers, in contrast with reader/writer
+locks, and they help ensure that memory freed by writers is not reused until
+all readers which may be accessing it have finished.
+Thus, they are a useful building block in the construction of lock-free
+data structures.
+.Pp
+These facilities are difficult to use correctly and should be avoided
+in preference to traditional mutual exclusion-based synchronization,
+except when performance or non-blocking guarantees are a major concern.
+.Pp
+See
+.Xr epoch 9
+and
+.Xr smr 9
+for details.
 .Ss Counting semaphores
 Counting semaphores provide a mechanism for synchronizing access
 to a pool of resources.
@@ -394,8 +420,10 @@ At this time this is a rather easy to remember table.
 .Sh SEE ALSO
 .Xr lockstat 1 ,
 .Xr witness 4 ,
+.Xr atomic 9 ,
 .Xr BUS_SETUP_INTR 9 ,
 .Xr condvar 9 ,
+.Xr epoch 9 ,
 .Xr lock 9 ,
 .Xr LOCK_PROFILING 9 ,
 .Xr mtx_pool 9 ,
@@ -404,13 +432,8 @@ At this time this is a rather easy to remember table.
 .Xr rwlock 9 ,
 .Xr sema 9 ,
 .Xr sleep 9 ,
+.Xr smr 9 ,
 .Xr sx 9 ,
 .Xr timeout 9
-.Sh HISTORY
-These
-functions appeared in
-.Bsx 4.1
-through
-.Fx 7.0 .
 .Sh BUGS
 There are too many locking primitives to choose from.
diff --git a/share/man/man9/smr.9 b/share/man/man9/smr.9
new file mode 100644
index 000000000000..7b2ed65b4368
--- /dev/null
+++ b/share/man/man9/smr.9
@@ -0,0 +1,294 @@
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2023 The FreeBSD Foundation
+.\"
+.\" This documentation was written by Mark Johnston <markj@FreeBSD.org>
+.\" under sponsorship from the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd January 17, 2023
+.Dt SMR 9
+.Os
+.Sh NAME
+.Nm smr
+.Nd safe memory reclamation for lock-free data structures
+.Sh SYNOPSIS
+.In sys/smr.h
+.Ft smr_seq_t
+.Fo smr_advance
+.Fa "smr_t smr"
+.Fc
+.Ft smr_t
+.Fo smr_create
+.Fa "const char *name"
+.Fc
+.Ft void
+.Fo smr_destroy
+.Fa "smr_t smr"
+.Fc
+.Ft void
+.Fo smr_enter
+.Fa "smr_t smr"
+.Fc
+.Ft void
+.Fo smr_exit
+.Fa "smr_t smr"
+.Fc
+.Ft bool
+.Fo smr_poll
+.Fa "smr_t smr"
+.Fa "smr_seq_t goal"
+.Fa "bool wait"
+.Fc
+.Ft void
+.Fo smr_synchronize
+.Fa "smr_t smr"
+.Fc
+.Ft bool
+.Fo smr_wait
+.Fa "smr_t smr"
+.Fa "smr_seq_t goal"
+.Fc
+.Sh DESCRIPTION
+Safe Memory Reclamation (SMR) is a facility which enables the implementation of
+memory-safe lock-free data structures.
+In typical usage, read accesses to an SMR-protected data structure, such as a
+hash table or tree, are performed in a
+.Dq read section
+consisting of code bracketed by
+.Fn smr_enter
+and
+.Fn smr_exit
+calls, while mutations of the data structure are serialized by a traditional
+mutex such as
+.Xr mutex 9 .
+In contrast with reader-writer locks such as
+.Xr rwlock 9 ,
+.Xr rmlock 9 ,
+and
+.Xr sx 9 ,
+SMR allows readers and writers to access the data structure concurrently.
+Readers can always enter a read section immediately
+.Po
+.Fn smr_enter
+never blocks
+.Pc ,
+so mutations do not introduce read latency.
+Furthermore,
+.Fn smr_enter
+and
+.Fn smr_exit
+operate only on per-CPU data and thus avoid some of the performance problems
+inherent in the implementation of traditional reader-writer mutexes.
+SMR can therefore be a useful building block for data structures which are
+accessed frequently but are only rarely modified.
+.Pp
+Note that any SMR-protected data structure must be implemented carefully such
+that operations behave correctly in the absence of mutual exclusion between
+readers and writers.
+The data structure must be designed to be lock-free; SMR merely facilitates
+the implementation, for example by making it safe to follow dangling pointers
+and by helping avoid the ABA problem.
+.Pp
+When shared accesses to and mutations of a data structure can proceed
+concurrently, writers must take care to ensure that any items removed from the
+structure are not freed and recycled while readers are accessing them in
+parallel.
+This requirement results in a two-phase approach to the removal of items:
+first, the item is unlinked such that all pointers to the item are removed from
+the structure, preventing any new readers from observing the item.
+Then, the writer waits until some mechanism guarantees that no existing readers
+are still accessing the item.
+At that point the memory for that item can be freed and reused safely.
+SMR provides this mechanism: readers may access a lock-free data structure in
+between calls to the
+.Fn smr_enter
+and
+.Fn smr_exit
+functions, which together create a read section, and the
+.Fn smr_advance ,
+.Fn smr_poll ,
+.Fn smr_wait ,
+and
+.Fn smr_synchronize
+functions can be used to wait for threads in read sections to finish.
+All of these functions operate on a
+.Ft smr_t
+state block which holds both per-CPU and global state.
+Readers load global state and modify per-CPU state, while writers must scan all
+per-CPU states to detect active readers.
+SMR is designed to amortize this cost by batching to give acceptable
+performance in write-heavy workloads.
+.Ss Readers
+Threads enter a read section by calling
+.Fn smr_enter .
+Read sections should be short, and many operations are not permitted while in
+a read section.
+Specifically, context switching is disabled, and thus readers may not acquire
+blocking mutexes such as
+.Xr mutex 9 .
+Another consequence of this is that the thread is pinned to the current CPU for
+the duration of the read section.
+Furthermore, read sections may not be nested: it is incorrect to call
+.Fn smr_enter
+with a given
+.Ft smr_t
+state block when already in a read section for that state block.
+.Ss UMA Integration
+To simplify the integration of SMR into consumers, the
+.Xr uma 9
+kernel memory allocator provides some SMR-specified facilities.
+This eliminates a good deal of complexity from the implementation of consumers
+and automatically batches write operations.
+.Pp
+In typical usage, a UMA zone (created with the
+.Dv UMA_ZONE_SMR
+flag or initialized with the
+.Fn uma_zone_set_smr
+function) is coupled with a
+.Ft smr_t
+state block.
+To insert an item into an SMR-protected data structure, memory is allocated
+from the zone using the
+.Fn uma_zalloc_smr
+function.
+Insertions and removals are serialized using traditional mutual exclusion
+and items are freed using the
+.Fn uma_zfree_smr
+function.
+Read-only lookup operations are performed in SMR read sections.
+.Fn uma_zfree_smr
+waits for all active readers which may be accessing the freed item to finish
+their read sections before recycling that item's memory.
+.Pp
+If the zone has an associated per-item destructor, it will be invoked at some
+point when no readers can be accessing a given item.
+The destructor can be used to release additional resources associated with the
+item.
+Note however that there is no guarantee that the destructor will be invoked in
+a bounded time period.
+.Ss Writers
+Consumers are expected to use SMR in conjunction with UMA and thus need only
+make use of the
+.Fn smr_enter
+and
+.Fn smr_exit
+functions, and the SMR helper macros defined in
+.Pa sys/smr_types.h .
+However, an introduction to the write-side interface of SMR can be useful.
+.Pp
+Internally, SMR maintains a global
+.Ql write sequence
+number.
+When entering a read section,
+.Fn smr_enter
+loads a copy of the write sequence and stores it in per-CPU memory, hence
+.Ql observing
+that value.
+To exit a read section, this per-CPU memory is overwritten with an invalid
+value, making the CPU inactive.
+Writers perform two operations: advancing the write sequence number, and
+polling all CPUs to see whether active readers have observed a given sequence
+number.
+These operations are performed by
+.Fn smr_advance
+and
+.Fn smr_poll ,
+respectively, which do not require serialization between multiple writers.
+.Pp
+After a writer unlinks an item from a data structure, it increments the write
+sequence number and tags the item with the new value returned by
+.Fn smr_advance .
+Once all CPUs have observed the new value, the writer can use
+.Fn smr_poll
+to deduce that no active readers have access to the unlinked item, and thus the
+item is safe to recycle.
+Because this pair of operations is relatively expensive, it is generally a good
+idea to amortize this cost by accumulating a collection of multiple unlinked
+items and tagging the entire batch with a target write sequence number.
+.Pp
+.Fn smr_poll
+is a non-blocking operation and returns true only if all active readers are
+guaranteed to have observed the target sequence number value.
+.Fn smr_wait
+is a variant of
+.Fn smr_poll
+which waits until all CPUs have observed the target sequence number value.
+.Fn smr_synchronize
+combines
+.Fn smr_advance
+with
+.Fn smr_wait
+to wait for all CPUs to observe a new write sequence number.
+This is an expensive operation and should only be used if polling cannot be
+deferred in some way.
+.Ss Memory Ordering
+The
+.Fn smr_enter
+function has acquire semantics, and the
+.Fn smr_exit
+function has release semantics.
+The
+.Fn smr_advance ,
+.Fn smr_poll ,
+.Fn smr_wait ,
+and
+.Fn smr_synchronize
+functions should not be assumed to have any guarantees with respect to memory
+ordering.
+In practice, some of these functions have stronger ordering semantics than
+is stated here, but this is specific to the implementation and should not be
+relied upon.
+See
+.Xr atomic 9
+for more details.
+.Sh NOTES
+Outside of
+.Fx
+the acronym SMR typically refers to a family of algorithms which enable
+memory-safe read-only access to a data structure concurrent with modifications
+to that data structure.
+Here, SMR refers to a particular algorithm belonging to this family, as well as
+its implementation in
+.Fx .
+See
+.Pa sys/sys/smr.h
+and
+.Pa sys/kern/subr_smr.c
+in the
+.Fx
+source tree for further details on the algorithm and the context.
+.Pp
+The acronym SMR is also used to mean "shingled magnetic recording", a
+technology used to store data on hard disk drives which requires operating
+system support.
+These two uses of the acronym are unrelated.
+.Sh SEE ALSO
+.Xr atomic 9 ,
+.Xr locking 9 ,
+.Xr uma 9
+.Sh AUTHORS
+The SMR algorithm and its implementation were provided by
+.An Jeff Roberson Aq Mt jeff@FreeBSD.org .
+This manual page was written by
+.An Mark Johnston Aq Mt markj@FreeBSD.org .
diff --git a/share/man/man9/zone.9 b/share/man/man9/zone.9
index 5c1b4d31d8aa..5f7f99b8f44f 100644
--- a/share/man/man9/zone.9
+++ b/share/man/man9/zone.9
@@ -25,7 +25,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 15, 2022
+.Dd January 16, 2023
 .Dt UMA 9
 .Os
 .Sh NAME
@@ -79,6 +79,8 @@ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 .Fn uma_zalloc_pcpu "uma_zone_t zone" "int flags"
 .Ft "void *"
 .Fn uma_zalloc_pcpu_arg "uma_zone_t zone" "void *arg" "int flags"
+.Ft "void *"
+.Fn uma_zalloc_smr "uma_zone_t zone" "int flags"
 .Ft void
 .Fn uma_zfree "uma_zone_t zone" "void *item"
 .Ft void
@@ -88,6 +90,8 @@ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 .Ft void
 .Fn uma_zfree_pcpu_arg "uma_zone_t zone" "void *item" "void *arg"
 .Ft void
+.Fn uma_zfree_smr "uma_zone_t zone" "void *item"
+.Ft void
 .Fn uma_prealloc "uma_zone_t zone" "int nitems"
 .Ft void
 .Fn uma_zone_reserve "uma_zone_t zone" "int nitems"
@@ -117,6 +121,10 @@ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 .Fn uma_zone_set_warning "uma_zone_t zone" "const char *warning"
 .Ft void
 .Fn uma_zone_set_maxaction "uma_zone_t zone" "void (*maxaction)(uma_zone_t)"
+.Ft smr_t
+.Fn uma_zone_get_smr "uma_zone_t zone"
+.Ft void
+.Fn uma_zone_set_smr "uma_zone_t zone" "smr_t smr"
 .In sys/sysctl.h
 .Fn SYSCTL_UMA_MAX parent nbr name access zone descr
 .Fn SYSCTL_ADD_UMA_MAX ctx parent nbr name access zone descr
@@ -330,6 +338,14 @@ Cached items that have not been used for a long period may also be freed from
 zone.
 When this flag is set, the system will not reclaim memory from the zone's
 caches.
+.It Dv UMA_ZONE_SMR
+Create a zone whose items will be synchronized using the
+.Xr smr 9
+mechanism.
+Upon creation the zone will have an associated
+.Dt smr_t
+structure which can be fetched using
+.Fn uma_zone_get_smr .
 .El
 .Pp
 Zones can be destroyed using
@@ -390,6 +406,17 @@ then
 does nothing.
 .Pp
 The
+.Fn uma_zalloc_smr
+and
+.Fn uma_zfree_smr
+functions allocate and free items from an SMR-enabled zone, that is,
+a zone created with
+.Dv UMA_ZONE_SMR
+or a zone that has had
+.Fn uma_zone_set_smr
+called.
+.Pp
+The
 .Fn uma_zalloc_domain
 function allows callers to specify a fixed
 .Xr numa 4
@@ -535,6 +562,16 @@ Therefore,
 this function should do very little work (similar to a signal handler).
 .Pp
 The
+.Fn uma_zone_set_smr
+function associates an existing
+.Xr smr 9
+structure with a UMA zone.
+The effect is similar to creating a zone with the
+.Dv UMA_ZONE_SMR
+flag, except that a new SMR structure is not created.
+This function must be called before any allocations from the zone are performed.
+.Pp
+The
 .Fn SYSCTL_UMA_MAX parent nbr name access zone descr
 macro declares a static
 .Xr sysctl 9
@@ -577,7 +614,8 @@ non-executable memory.
 .Sh SEE ALSO
 .Xr numa 4 ,
 .Xr vmstat 8 ,
-.Xr malloc 9
+.Xr malloc 9 ,
+.Xr smr 9
 .Rs
 .%A Jeff Bonwick
 .%T "The Slab Allocator: An Object-Caching Kernel Memory Allocator"