1*ee1ee6dbSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2*ee1ee6dbSThomas Gleixner
3*ee1ee6dbSThomas Gleixner /*
4*ee1ee6dbSThomas Gleixner * rcuref - A scalable reference count implementation for RCU managed objects
5*ee1ee6dbSThomas Gleixner *
6*ee1ee6dbSThomas Gleixner * rcuref is provided to replace open coded reference count implementations
 * based on atomic_t. It protects explicitly RCU managed objects which can
8*ee1ee6dbSThomas Gleixner * be visible even after the last reference has been dropped and the object
9*ee1ee6dbSThomas Gleixner * is heading towards destruction.
10*ee1ee6dbSThomas Gleixner *
11*ee1ee6dbSThomas Gleixner * A common usage pattern is:
12*ee1ee6dbSThomas Gleixner *
 *	get()
 *		rcu_read_lock();
 *		p = get_ptr();
 *		if (p && !atomic_inc_not_zero(&p->refcnt))
 *			p = NULL;
 *		rcu_read_unlock();
 *		return p;
 *
 *	put()
 *		if (!atomic_dec_return(&p->refcnt)) {
 *			remove_ptr(p);
 *			kfree_rcu(p, rcu);
 *		}
26*ee1ee6dbSThomas Gleixner *
27*ee1ee6dbSThomas Gleixner * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
28*ee1ee6dbSThomas Gleixner * O(N^2) behaviour under contention with N concurrent operations.
29*ee1ee6dbSThomas Gleixner *
30*ee1ee6dbSThomas Gleixner * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
31*ee1ee6dbSThomas Gleixner * better under contention.
32*ee1ee6dbSThomas Gleixner *
33*ee1ee6dbSThomas Gleixner * Why not refcount?
34*ee1ee6dbSThomas Gleixner * =================
35*ee1ee6dbSThomas Gleixner *
36*ee1ee6dbSThomas Gleixner * In principle it should be possible to make refcount use the rcuref
37*ee1ee6dbSThomas Gleixner * scheme, but the destruction race described below cannot be prevented
38*ee1ee6dbSThomas Gleixner * unless the protected object is RCU managed.
39*ee1ee6dbSThomas Gleixner *
40*ee1ee6dbSThomas Gleixner * Theory of operation
41*ee1ee6dbSThomas Gleixner * ===================
42*ee1ee6dbSThomas Gleixner *
43*ee1ee6dbSThomas Gleixner * rcuref uses an unsigned integer reference counter. As long as the
44*ee1ee6dbSThomas Gleixner * counter value is greater than or equal to RCUREF_ONEREF and not larger
45*ee1ee6dbSThomas Gleixner * than RCUREF_MAXREF the reference is alive:
46*ee1ee6dbSThomas Gleixner *
 *           ONEREF   MAXREF                SATURATED             RELEASED   DEAD       NOREF
 *           0        0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
 *           <---valid --------> <-------saturation zone--------> <-----dead zone----->
50*ee1ee6dbSThomas Gleixner *
51*ee1ee6dbSThomas Gleixner * The get() and put() operations do unconditional increments and
52*ee1ee6dbSThomas Gleixner * decrements. The result is checked after the operation. This optimizes
53*ee1ee6dbSThomas Gleixner * for the fast path.
54*ee1ee6dbSThomas Gleixner *
55*ee1ee6dbSThomas Gleixner * If the reference count is saturated or dead, then the increments and
56*ee1ee6dbSThomas Gleixner * decrements are not harmful as the reference count still stays in the
 * respective zones and is always set back to SATURATED resp. DEAD. The
58*ee1ee6dbSThomas Gleixner * zones have room for 2^28 racing operations in each direction, which
59*ee1ee6dbSThomas Gleixner * makes it practically impossible to escape the zones.
60*ee1ee6dbSThomas Gleixner *
61*ee1ee6dbSThomas Gleixner * Once the last reference is dropped the reference count becomes
62*ee1ee6dbSThomas Gleixner * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
63*ee1ee6dbSThomas Gleixner * slowpath then tries to set the reference count from RCUREF_NOREF to
64*ee1ee6dbSThomas Gleixner * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
65*ee1ee6dbSThomas Gleixner * concurrent rcuref_get() can acquire the reference count and bring it
66*ee1ee6dbSThomas Gleixner * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
67*ee1ee6dbSThomas Gleixner *
68*ee1ee6dbSThomas Gleixner * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
69*ee1ee6dbSThomas Gleixner * DEAD + 1, which is inside the dead zone. If that happens the reference
70*ee1ee6dbSThomas Gleixner * count is put back to DEAD.
71*ee1ee6dbSThomas Gleixner *
72*ee1ee6dbSThomas Gleixner * The actual race is possible due to the unconditional increment and
73*ee1ee6dbSThomas Gleixner * decrements in rcuref_get() and rcuref_put():
74*ee1ee6dbSThomas Gleixner *
 *      T1                              T2
 *      get()                           put()
 *                                      if (atomic_add_negative(-1, &ref->refcnt))
 *              succeeds->                      atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
 *
 *      atomic_add_negative(1, &ref->refcnt);   <- Elevates refcount to DEAD + 1
81*ee1ee6dbSThomas Gleixner *
82*ee1ee6dbSThomas Gleixner * As the result of T1's add is negative, the get() goes into the slow path
83*ee1ee6dbSThomas Gleixner * and observes refcnt being in the dead zone which makes the operation fail.
84*ee1ee6dbSThomas Gleixner *
85*ee1ee6dbSThomas Gleixner * Possible critical states:
86*ee1ee6dbSThomas Gleixner *
 *      Context Counter References      Operation
 *      T1      0       1               init()
 *      T2      1       2               get()
 *      T1      0       1               put()
 *      T2     -1       0               put() tries to mark dead
 *      T1      0       1               get()
 *      T2      0       1               put() mark dead fails
 *      T1     -1       0               put() tries to mark dead
 *      T1    DEAD      0               put() mark dead succeeds
 *      T2    DEAD+1    0               get() fails and puts it back to DEAD
97*ee1ee6dbSThomas Gleixner *
98*ee1ee6dbSThomas Gleixner * Of course there are more complex scenarios, but the above illustrates
99*ee1ee6dbSThomas Gleixner * the working principle. The rest is left to the imagination of the
100*ee1ee6dbSThomas Gleixner * reader.
101*ee1ee6dbSThomas Gleixner *
102*ee1ee6dbSThomas Gleixner * Deconstruction race
103*ee1ee6dbSThomas Gleixner * ===================
104*ee1ee6dbSThomas Gleixner *
105*ee1ee6dbSThomas Gleixner * The release operation must be protected by prohibiting a grace period in
106*ee1ee6dbSThomas Gleixner * order to prevent a possible use after free:
107*ee1ee6dbSThomas Gleixner *
 *      T1                              T2
 *      put()                           get()
 *      // ref->refcnt = ONEREF
 *      if (!atomic_add_negative(-1, &ref->refcnt))
 *              return false;                           <- Not taken
 *
 *      // ref->refcnt == NOREF
 *      --> preemption
 *                                      // Elevates ref->refcnt to ONEREF
 *                                      if (!atomic_add_negative(1, &ref->refcnt))
 *                                              return true;                    <- taken
 *
 *                                      if (put(&p->ref)) { <-- Succeeds
 *                                              remove_pointer(p);
 *                                              kfree_rcu(p, rcu);
 *                                      }
 *
 *              RCU grace period ends, object is freed
 *
 *      atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);      <- UAF
128*ee1ee6dbSThomas Gleixner *
129*ee1ee6dbSThomas Gleixner * This is prevented by disabling preemption around the put() operation as
130*ee1ee6dbSThomas Gleixner * that's in most kernel configurations cheaper than a rcu_read_lock() /
131*ee1ee6dbSThomas Gleixner * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
132*ee1ee6dbSThomas Gleixner * prevents the grace period which keeps the object alive until all put()
133*ee1ee6dbSThomas Gleixner * operations complete.
134*ee1ee6dbSThomas Gleixner *
135*ee1ee6dbSThomas Gleixner * Saturation protection
136*ee1ee6dbSThomas Gleixner * =====================
137*ee1ee6dbSThomas Gleixner *
138*ee1ee6dbSThomas Gleixner * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
 * Once this is exceeded the reference count becomes stale by setting it
140*ee1ee6dbSThomas Gleixner * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
141*ee1ee6dbSThomas Gleixner * wrap arounds which obviously cause worse problems than a memory
142*ee1ee6dbSThomas Gleixner * leak. When saturation is reached a warning is emitted.
143*ee1ee6dbSThomas Gleixner *
144*ee1ee6dbSThomas Gleixner * Race conditions
145*ee1ee6dbSThomas Gleixner * ===============
146*ee1ee6dbSThomas Gleixner *
147*ee1ee6dbSThomas Gleixner * All reference count increment/decrement operations are unconditional and
148*ee1ee6dbSThomas Gleixner * only verified after the fact. This optimizes for the good case and takes
149*ee1ee6dbSThomas Gleixner * the occasional race vs. a dead or already saturated refcount into
 * account. The saturation and dead zones are large enough to accommodate
 * that.
152*ee1ee6dbSThomas Gleixner *
153*ee1ee6dbSThomas Gleixner * Memory ordering
154*ee1ee6dbSThomas Gleixner * ===============
155*ee1ee6dbSThomas Gleixner *
156*ee1ee6dbSThomas Gleixner * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
157*ee1ee6dbSThomas Gleixner * and provide only what is strictly required for refcounts.
158*ee1ee6dbSThomas Gleixner *
159*ee1ee6dbSThomas Gleixner * The increments are fully relaxed; these will not provide ordering. The
160*ee1ee6dbSThomas Gleixner * rationale is that whatever is used to obtain the object to increase the
 * reference count on will provide the ordering. For locked data
 * structures, it's the lock acquire, for RCU/lockless data structures it's
 * the dependent load.
164*ee1ee6dbSThomas Gleixner *
165*ee1ee6dbSThomas Gleixner * rcuref_get() provides a control dependency ordering future stores which
166*ee1ee6dbSThomas Gleixner * ensures that the object is not modified when acquiring a reference
167*ee1ee6dbSThomas Gleixner * fails.
168*ee1ee6dbSThomas Gleixner *
169*ee1ee6dbSThomas Gleixner * rcuref_put() provides release order, i.e. all prior loads and stores
170*ee1ee6dbSThomas Gleixner * will be issued before. It also provides a control dependency ordering
171*ee1ee6dbSThomas Gleixner * against the subsequent destruction of the object.
172*ee1ee6dbSThomas Gleixner *
173*ee1ee6dbSThomas Gleixner * If rcuref_put() successfully dropped the last reference and marked the
174*ee1ee6dbSThomas Gleixner * object DEAD it also provides acquire ordering.
175*ee1ee6dbSThomas Gleixner */
176*ee1ee6dbSThomas Gleixner
177*ee1ee6dbSThomas Gleixner #include <linux/export.h>
178*ee1ee6dbSThomas Gleixner #include <linux/rcuref.h>
179*ee1ee6dbSThomas Gleixner
180*ee1ee6dbSThomas Gleixner /**
181*ee1ee6dbSThomas Gleixner * rcuref_get_slowpath - Slowpath of rcuref_get()
182*ee1ee6dbSThomas Gleixner * @ref: Pointer to the reference count
183*ee1ee6dbSThomas Gleixner *
184*ee1ee6dbSThomas Gleixner * Invoked when the reference count is outside of the valid zone.
185*ee1ee6dbSThomas Gleixner *
186*ee1ee6dbSThomas Gleixner * Return:
187*ee1ee6dbSThomas Gleixner * False if the reference count was already marked dead
188*ee1ee6dbSThomas Gleixner *
189*ee1ee6dbSThomas Gleixner * True if the reference count is saturated, which prevents the
190*ee1ee6dbSThomas Gleixner * object from being deconstructed ever.
191*ee1ee6dbSThomas Gleixner */
rcuref_get_slowpath(rcuref_t * ref)192*ee1ee6dbSThomas Gleixner bool rcuref_get_slowpath(rcuref_t *ref)
193*ee1ee6dbSThomas Gleixner {
194*ee1ee6dbSThomas Gleixner unsigned int cnt = atomic_read(&ref->refcnt);
195*ee1ee6dbSThomas Gleixner
196*ee1ee6dbSThomas Gleixner /*
197*ee1ee6dbSThomas Gleixner * If the reference count was already marked dead, undo the
198*ee1ee6dbSThomas Gleixner * increment so it stays in the middle of the dead zone and return
199*ee1ee6dbSThomas Gleixner * fail.
200*ee1ee6dbSThomas Gleixner */
201*ee1ee6dbSThomas Gleixner if (cnt >= RCUREF_RELEASED) {
202*ee1ee6dbSThomas Gleixner atomic_set(&ref->refcnt, RCUREF_DEAD);
203*ee1ee6dbSThomas Gleixner return false;
204*ee1ee6dbSThomas Gleixner }
205*ee1ee6dbSThomas Gleixner
206*ee1ee6dbSThomas Gleixner /*
207*ee1ee6dbSThomas Gleixner * If it was saturated, warn and mark it so. In case the increment
208*ee1ee6dbSThomas Gleixner * was already on a saturated value restore the saturation
209*ee1ee6dbSThomas Gleixner * marker. This keeps it in the middle of the saturation zone and
210*ee1ee6dbSThomas Gleixner * prevents the reference count from overflowing. This leaks the
211*ee1ee6dbSThomas Gleixner * object memory, but prevents the obvious reference count overflow
212*ee1ee6dbSThomas Gleixner * damage.
213*ee1ee6dbSThomas Gleixner */
214*ee1ee6dbSThomas Gleixner if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
215*ee1ee6dbSThomas Gleixner atomic_set(&ref->refcnt, RCUREF_SATURATED);
216*ee1ee6dbSThomas Gleixner return true;
217*ee1ee6dbSThomas Gleixner }
218*ee1ee6dbSThomas Gleixner EXPORT_SYMBOL_GPL(rcuref_get_slowpath);
219*ee1ee6dbSThomas Gleixner
220*ee1ee6dbSThomas Gleixner /**
221*ee1ee6dbSThomas Gleixner * rcuref_put_slowpath - Slowpath of __rcuref_put()
222*ee1ee6dbSThomas Gleixner * @ref: Pointer to the reference count
223*ee1ee6dbSThomas Gleixner *
224*ee1ee6dbSThomas Gleixner * Invoked when the reference count is outside of the valid zone.
225*ee1ee6dbSThomas Gleixner *
226*ee1ee6dbSThomas Gleixner * Return:
227*ee1ee6dbSThomas Gleixner * True if this was the last reference with no future references
228*ee1ee6dbSThomas Gleixner * possible. This signals the caller that it can safely schedule the
229*ee1ee6dbSThomas Gleixner * object, which is protected by the reference counter, for
230*ee1ee6dbSThomas Gleixner * deconstruction.
231*ee1ee6dbSThomas Gleixner *
232*ee1ee6dbSThomas Gleixner * False if there are still active references or the put() raced
233*ee1ee6dbSThomas Gleixner * with a concurrent get()/put() pair. Caller is not allowed to
234*ee1ee6dbSThomas Gleixner * deconstruct the protected object.
235*ee1ee6dbSThomas Gleixner */
rcuref_put_slowpath(rcuref_t * ref)236*ee1ee6dbSThomas Gleixner bool rcuref_put_slowpath(rcuref_t *ref)
237*ee1ee6dbSThomas Gleixner {
238*ee1ee6dbSThomas Gleixner unsigned int cnt = atomic_read(&ref->refcnt);
239*ee1ee6dbSThomas Gleixner
240*ee1ee6dbSThomas Gleixner /* Did this drop the last reference? */
241*ee1ee6dbSThomas Gleixner if (likely(cnt == RCUREF_NOREF)) {
242*ee1ee6dbSThomas Gleixner /*
243*ee1ee6dbSThomas Gleixner * Carefully try to set the reference count to RCUREF_DEAD.
244*ee1ee6dbSThomas Gleixner *
245*ee1ee6dbSThomas Gleixner * This can fail if a concurrent get() operation has
246*ee1ee6dbSThomas Gleixner * elevated it again or the corresponding put() even marked
247*ee1ee6dbSThomas Gleixner * it dead already. Both are valid situations and do not
248*ee1ee6dbSThomas Gleixner * require a retry. If this fails the caller is not
249*ee1ee6dbSThomas Gleixner * allowed to deconstruct the object.
250*ee1ee6dbSThomas Gleixner */
251*ee1ee6dbSThomas Gleixner if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF)
252*ee1ee6dbSThomas Gleixner return false;
253*ee1ee6dbSThomas Gleixner
254*ee1ee6dbSThomas Gleixner /*
255*ee1ee6dbSThomas Gleixner * The caller can safely schedule the object for
256*ee1ee6dbSThomas Gleixner * deconstruction. Provide acquire ordering.
257*ee1ee6dbSThomas Gleixner */
258*ee1ee6dbSThomas Gleixner smp_acquire__after_ctrl_dep();
259*ee1ee6dbSThomas Gleixner return true;
260*ee1ee6dbSThomas Gleixner }
261*ee1ee6dbSThomas Gleixner
262*ee1ee6dbSThomas Gleixner /*
263*ee1ee6dbSThomas Gleixner * If the reference count was already in the dead zone, then this
264*ee1ee6dbSThomas Gleixner * put() operation is imbalanced. Warn, put the reference count back to
265*ee1ee6dbSThomas Gleixner * DEAD and tell the caller to not deconstruct the object.
266*ee1ee6dbSThomas Gleixner */
267*ee1ee6dbSThomas Gleixner if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
268*ee1ee6dbSThomas Gleixner atomic_set(&ref->refcnt, RCUREF_DEAD);
269*ee1ee6dbSThomas Gleixner return false;
270*ee1ee6dbSThomas Gleixner }
271*ee1ee6dbSThomas Gleixner
272*ee1ee6dbSThomas Gleixner /*
273*ee1ee6dbSThomas Gleixner * This is a put() operation on a saturated refcount. Restore the
274*ee1ee6dbSThomas Gleixner * mean saturation value and tell the caller to not deconstruct the
275*ee1ee6dbSThomas Gleixner * object.
276*ee1ee6dbSThomas Gleixner */
277*ee1ee6dbSThomas Gleixner if (cnt > RCUREF_MAXREF)
278*ee1ee6dbSThomas Gleixner atomic_set(&ref->refcnt, RCUREF_SATURATED);
279*ee1ee6dbSThomas Gleixner return false;
280*ee1ee6dbSThomas Gleixner }
281*ee1ee6dbSThomas Gleixner EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
282