1    	/*
2    	 * vim:noexpandtab:shiftwidth=8:tabstop=8:
3    	 *
4    	 * This program is free software; you can redistribute it and/or
5    	 * modify it under the terms of the GNU Lesser General Public
6    	 * License as published by the Free Software Foundation; either
7    	 * version 3 of the License, or (at your option) any later version.
8    	 *
9    	 * This program is distributed in the hope that it will be useful,
10   	 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   	 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   	 * Lesser General Public License for more details.
13   	 *
14   	 * You should have received a copy of the GNU Lesser General Public
15   	 * License along with this library; if not, write to the Free Software
16   	 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17   	 * 02110-1301 USA
18   	 *
19   	 * ---------------------------------------
20   	 */
21   	
22   	/**
23   	 * @defgroup SAL State abstraction layer
24   	 * @{
25   	 */
26   	
27   	/**
28   	 * @file nfs4_recovery.c
29   	 * @brief NFSv4 recovery
30   	 */
31   	
32   	#include "config.h"
33   	#include "log.h"
34   	#include "nfs_core.h"
35   	#include "nfs4.h"
36   	#include "sal_functions.h"
37   	#include <sys/stat.h>
38   	#include <sys/types.h>
39   	#include <fcntl.h>
40   	#include <ctype.h>
41   	#include "bsd-base64.h"
42   	#include "client_mgr.h"
43   	#include "fsal.h"
44   	
45   	/* The grace_mutex protects current_grace, clid_list, and clid_count */
46   	static pthread_mutex_t grace_mutex = PTHREAD_MUTEX_INITIALIZER;
47   	static struct timespec current_grace; /* current grace period timeout */
48   	static int clid_count; /* number of active clients */
49   	static struct glist_head clid_list = GLIST_HEAD_INIT(clid_list);  /* clients */
50   	
51   	/*
52   	 * Low two bits of grace_status word are flags. One for whether we're currently
53   	 * in a grace period and one if a change was requested.
54   	 */
55   	#define GRACE_STATUS_ACTIVE_SHIFT	0
56   	#define GRACE_STATUS_CHANGE_REQ_SHIFT	1
57   	
58   	/* The remaining bits are for the refcount */
59   	#define GRACE_STATUS_COUNTER_SHIFT	2
60   	
61   	#define GRACE_STATUS_ACTIVE		(1U << GRACE_STATUS_ACTIVE_SHIFT)
62   	#define GRACE_STATUS_CHANGE_REQ		(1U << GRACE_STATUS_CHANGE_REQ_SHIFT)
63   	#define GRACE_STATUS_REF_INCREMENT	(1U << GRACE_STATUS_COUNTER_SHIFT)
64   	#define GRACE_STATUS_COUNT_MASK		((~0U) << GRACE_STATUS_COUNTER_SHIFT)
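/*
 * Example: a grace_status value of
 * ((3 << GRACE_STATUS_COUNTER_SHIFT) | GRACE_STATUS_ACTIVE) means the grace
 * period is active, no change has been requested, and three references are
 * currently outstanding.
 */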
65   	
66   	static uint32_t	grace_status;
67   	
68   	static struct nfs4_recovery_backend *recovery_backend;
69   	int32_t reclaim_completes; /* atomic */
70   	
71   	static void nfs4_recovery_load_clids(nfs_grace_start_t *gsp);
72   	static void nfs_release_nlm_state(char *release_ip);
73   	static void nfs_release_v4_client(char *ip);
74   	
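/**
 * @brief Add a client entry to the in-memory recovery list
 *
 * Caller must hold grace_mutex.
 *
 * @param[in] cl_name Client recovery tag
 *
 * @return the newly added entry.
 */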
75   	clid_entry_t *nfs4_add_clid_entry(char *cl_name)
76   	{
77   		clid_entry_t *new_ent = gsh_malloc(sizeof(clid_entry_t));
78   	
79   		glist_init(&new_ent->cl_rfh_list);
	/* cl_name is a fixed-size array (PATH_MAX bytes); bound the copy so
	 * an oversized cl_name cannot overrun it.
	 */
	strncpy(new_ent->cl_name, cl_name, sizeof(new_ent->cl_name) - 1);
	new_ent->cl_name[sizeof(new_ent->cl_name) - 1] = '\0';
81   		glist_add(&clid_list, &new_ent->cl_list);
82   		++clid_count;
83   		return new_ent;
84   	}
85   	
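/**
 * @brief Record a revoked filehandle under a client's recovery entry
 *
 * @param[in] clid_ent Recovery entry for the client
 * @param[in] rfh_name Revoked filehandle, in string form
 *
 * @return the newly added entry.
 */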
86   	rdel_fh_t *nfs4_add_rfh_entry(clid_entry_t *clid_ent, char *rfh_name)
87   	{
88   		rdel_fh_t *new_ent = gsh_malloc(sizeof(rdel_fh_t));
89   	
90   		new_ent->rdfh_handle_str = gsh_strdup(rfh_name);
91   		glist_add(&clid_ent->cl_rfh_list, &new_ent->rdfh_list);
92   		return new_ent;
93   	}
94   	
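/**
 * @brief Free all entries in the in-memory recovery client list
 *
 * Caller must hold grace_mutex.
 */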
95   	void nfs4_cleanup_clid_entries(void)
96   	{
97   		struct clid_entry *clid_entry;
98   		/* when not doing a takeover, start with an empty list */
99   		while ((clid_entry = glist_first_entry(&clid_list,
100  						       struct clid_entry,
101  						       cl_list)) != NULL) {
102  			glist_del(&clid_entry->cl_list);
103  			gsh_free(clid_entry);
104  			--clid_count;
105  		}
106  		assert(clid_count == 0);
107  		atomic_store_int32_t(&reclaim_completes, 0);
108  	}
109  	
110  	/*
111  	 * Check the current status of the grace period against what the caller needs.
112  	 * If it's different then return false without taking a reference. If a change
113  	 * has been requested, then we also don't want to give out a reference.
114  	 */
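/*
 * Typical caller pattern (a sketch): take a reference for the grace state
 * the operation requires, perform the operation, then drop the reference:
 *
 *	if (nfs_get_grace_status(false)) {
 *		... perform a non-reclaim operation ...
 *		nfs_put_grace_status();
 *	}
 */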
115  	bool nfs_get_grace_status(bool want_grace)
116  	{
117  		uint32_t cur, pro, old;
118  	
119  		old = atomic_fetch_uint32_t(&grace_status);
120  		do {
121  			cur = old;
122  	
123  			/* If it's not the state we want, then no reference */
124  			if (want_grace != (bool)(cur & GRACE_STATUS_ACTIVE))
125  				return false;
126  	
127  			/* If a change was requested, no reference */
128  			if (cur & GRACE_STATUS_CHANGE_REQ)
129  				return false;
130  	
131  			/* Bump the counter */
132  			pro = cur + GRACE_STATUS_REF_INCREMENT;
133  			old = __sync_val_compare_and_swap(&grace_status, cur, pro);
134  		} while (old != cur);
135  		return true;
136  	}
137  	
138  	/*
139  	 * Put grace status. If the refcount goes to zero, and a change was requested,
140  	 * then wake the reaper thread to do its thing.
141  	 */
142  	void nfs_put_grace_status(void)
143  	{
144  		uint32_t cur;
145  	
	/* sub_and_fetch returns the post-decrement value, so the check below
	 * sees the count with this reference already dropped.
	 */
	cur = __sync_sub_and_fetch(&grace_status, GRACE_STATUS_REF_INCREMENT);
147  		if (cur & GRACE_STATUS_CHANGE_REQ &&
148  		    !(cur >> GRACE_STATUS_COUNTER_SHIFT))
149  			reaper_wake();
150  	}
151  	
152  	/**
153  	 * Lift the grace period if it's still active.
154  	 */
155  	static void
156  	nfs_lift_grace_locked(void)
157  	{
158  		uint32_t cur;
159  	
160  		/*
161  		 * Caller must hold grace_mutex. Only the thread that actually sets
162  		 * the value to 0 gets to clean up the recovery db.
163  		 */
164  		if (nfs_in_grace()) {
165  			nfs_end_grace();
166  			__sync_synchronize();
167  			/* Now change the actual status */
168  			cur = __sync_and_and_fetch(&grace_status,
169  				~(GRACE_STATUS_ACTIVE|GRACE_STATUS_CHANGE_REQ));
170  			assert(!(cur & GRACE_STATUS_COUNT_MASK));
171  			LogEvent(COMPONENT_STATE, "NFS Server Now NOT IN GRACE");
172  		}
173  	}
174  	
175  	/*
176  	 * Report our new state to the cluster
177  	 */
178  	static void nfs4_set_enforcing(void)
179  	{
180  		if (recovery_backend->set_enforcing)
181  			recovery_backend->set_enforcing();
182  	}
183  	
184  	/**
185  	 * @brief Start grace period
186  	 *
187  	 * This routine can be called due to server start/restart or from
 * failover code.  If this node is taking over for another node, that node's
 * id will be passed to this routine inside the grace start structure.
190  	 *
191  	 * @param[in] gsp Grace period start information
192  	 */
193  	void nfs_start_grace(nfs_grace_start_t *gsp)
194  	{
195  		int ret;
196  		bool was_grace;
197  		uint32_t cur, old, pro;
198  	
199  		PTHREAD_MUTEX_lock(&grace_mutex);
200  	
201  		if (nfs_param.nfsv4_param.graceless) {
202  			nfs_lift_grace_locked();
203  			LogEvent(COMPONENT_STATE,
204  				 "NFS Server skipping GRACE (Graceless is true)");
205  			goto out;
206  		}
207  	
	/* Grace should always be greater than or equal to the lease time.
	 * Some clients are known to have problems with a grace period longer
	 * than 60 seconds; Lease_Lifetime should be set to a smaller value
	 * for those setups.
	 *
	 * Checks against the grace period are lockless, so we want to ensure
	 * that callers see the new grace status as soon as possible; the full
	 * barrier below ensures enforcement begins ASAP.
	 */
217  	
218  		/*
219  		 * Ensure there are no outstanding references to the current state of
220  		 * grace. If there are, set flag indicating that a change has been
221  		 * requested and that no more references will be handed out until it
222  		 * takes effect.
223  		 */
224  		ret = clock_gettime(CLOCK_MONOTONIC, &current_grace);
225  		if (ret != 0) {
226  			LogCrit(COMPONENT_MAIN, "Failed to get timestamp");
227  			assert(0);	/* if this is broken, we are toast so die */
228  		}
229  	
230  		cur = atomic_fetch_uint32_t(&grace_status);
231  		do {
232  			old = cur;
233  			was_grace = cur & GRACE_STATUS_ACTIVE;
234  	
235  			/* If we're already in a grace period then we're done */
236  			if (was_grace)
237  				break;
238  	
239  			/*
240  			 * Are there outstanding refs? If so, then set the change req
241  			 * flag and nothing else. If not, then clear the change req
242  			 * flag and flip the active bit.
243  			 */
244  			if (old & GRACE_STATUS_COUNT_MASK) {
245  				pro = old | GRACE_STATUS_CHANGE_REQ;
246  			} else {
247  				pro = old | GRACE_STATUS_ACTIVE;
248  				pro &= ~GRACE_STATUS_CHANGE_REQ;
249  			}
250  	
251  			/* If there are no changes, then we don't need to update */
252  			if (pro == old)
253  				break;
254  			cur = __sync_val_compare_and_swap(&grace_status, old, pro);
255  		} while (cur != old);
256  	
257  		/*
258  		 * If we were not in a grace period before and there were still
259  		 * references outstanding, then we can't do anything else.
260  		 */
261  		if (!was_grace && (old & GRACE_STATUS_COUNT_MASK))
262  			goto out;
263  	
264  		__sync_synchronize();
265  	
266  		if ((int)nfs_param.nfsv4_param.grace_period <
267  			(int)nfs_param.nfsv4_param.lease_lifetime) {
268  			LogWarn(COMPONENT_STATE,
269  			 "NFS Server GRACE duration should at least match LEASE period. Current configured values are GRACE(%d), LEASE(%d)",
270  			 (int)nfs_param.nfsv4_param.grace_period,
271  			 (int)nfs_param.nfsv4_param.lease_lifetime);
272  		}
273  	
274  		LogEvent(COMPONENT_STATE, "NFS Server Now IN GRACE, duration %d",
275  			 (int)nfs_param.nfsv4_param.grace_period);
276  	
277  		/* Set enforcing flag here */
278  		if (!was_grace)
279  			nfs4_set_enforcing();
280  	
281  		/*
282  		 * If we're just starting the grace period, then load the
283  		 * clid database. Don't load it however if we're extending the
284  		 * existing grace period.
285  		 */
286  		if (!gsp && !was_grace) {
287  			nfs4_recovery_load_clids(NULL);
288  		} else if (gsp && gsp->event != EVENT_JUST_GRACE) {
289  			/*
		 * If called from failover code and given a nodeid, then this
		 * node is doing a takeover.  Read in the client ids from the
		 * failing node.
293  			 */
294  			LogEvent(COMPONENT_STATE,
295  				 "NFS Server recovery event %d nodeid %d ip %s",
296  				 gsp->event, gsp->nodeid, gsp->ipaddr);
297  	
298  			if (gsp->event == EVENT_CLEAR_BLOCKED)
299  				cancel_all_nlm_blocked();
300  			else {
301  				nfs_release_nlm_state(gsp->ipaddr);
302  				if (gsp->event == EVENT_RELEASE_IP) {
303  					PTHREAD_MUTEX_unlock(&grace_mutex);
304  					nfs_release_v4_client(gsp->ipaddr);
305  					return;
			} else {
308  					nfs4_recovery_load_clids(gsp);
309  				}
310  			}
311  		}
312  	out:
313  		PTHREAD_MUTEX_unlock(&grace_mutex);
314  	}
315  	
316  	/**
317  	 * @brief Check if we are in the grace period
318  	 *
319  	 * @retval true if so.
320  	 * @retval false if not.
321  	 */
322  	bool nfs_in_grace(void)
323  	{
324  		return atomic_fetch_uint32_t(&grace_status) & GRACE_STATUS_ACTIVE;
325  	}
326  	
327  	/**
328  	 * @brief Enter the grace period if another node in the cluster needs it
329  	 *
330  	 * Singleton servers generally won't use this operation. Clustered servers
331  	 * call this function to check whether another node might need a grace period.
332  	 */
333  	void nfs_maybe_start_grace(void)
334  	{
335  		if (!nfs_in_grace() && recovery_backend->maybe_start_grace)
336  			recovery_backend->maybe_start_grace();
337  	}
338  	
339  	/**
340  	 * @brief Are all hosts in cluster enforcing the grace period?
341  	 *
342  	 * Singleton servers always return true here since the only grace period that
343  	 * matters is the local one. Clustered backends should check to make sure that
344  	 * the whole cluster is in grace.
345  	 */
346  	bool nfs_grace_enforcing(void)
347  	{
348  		if (recovery_backend->grace_enforcing)
349  			return recovery_backend->grace_enforcing();
350  		return true;
351  	}
352  	
353  	/**
354  	 * @brief Is this host still a member of the cluster?
355  	 *
356  	 * Singleton servers are always considered to be cluster members. This call
357  	 * is mainly for clustered servers, which may need to handle things differently
358  	 * on a clean shutdown depending on whether they are still a member of the
359  	 * cluster.
360  	 */
361  	bool nfs_grace_is_member(void)
362  	{
363  		if (recovery_backend->is_member)
364  			return recovery_backend->is_member();
365  		return true;
366  	}
367  	
368  	/**
369  	 * @brief Return nodeid for the server
370  	 *
371  	 * If the recovery backend specifies a nodeid, return it. If it does not
372  	 * specify one, default to using the server's hostname.
373  	 *
374  	 * Returns 0 on success and fills out pnodeid. Caller must free the returned
375  	 * value with gsh_free. Returns negative POSIX error code on error.
376  	 */
377  	int nfs_recovery_get_nodeid(char **pnodeid)
378  	{
379  		int rc;
380  		long maxlen;
381  		char *nodeid = NULL;
382  	
383  		if (recovery_backend->get_nodeid) {
384  			rc = recovery_backend->get_nodeid(&nodeid);
385  	
386  			/* Return error if we got one */
387  			if (rc)
388  				return rc;
389  	
390  			/* If we got a nodeid, then we're done */
391  			if (nodeid) {
392  				*pnodeid = nodeid;
393  				return 0;
394  			}
395  		}
396  	
397  		/*
398  		 * Either the backend doesn't support get_nodeid or it handed back a
399  		 * NULL pointer. Just use hostname.
400  		 */
401  		maxlen = sysconf(_SC_HOST_NAME_MAX);
402  		nodeid = gsh_malloc(maxlen);
403  		rc = gethostname(nodeid, maxlen);
404  		if (rc != 0) {
405  			LogEvent(COMPONENT_CLIENTID, "gethostname failed: %d", errno);
406  			rc = -errno;
407  			gsh_free(nodeid);
408  		} else {
409  			*pnodeid = nodeid;
410  		}
411  		return rc;
412  	}
413  	
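/**
 * @brief Try to lift the grace period if it is no longer needed
 *
 * The grace period can end early when NLM is disabled and every client from
 * the previous epoch has sent a RECLAIM_COMPLETE; otherwise it ends once the
 * configured grace_period has elapsed. Clustered backends may veto the lift
 * via their try_lift_grace operation.
 */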
414  	void nfs_try_lift_grace(void)
415  	{
416  		bool in_grace = true;
417  		int32_t rc_count = 0;
418  		uint32_t cur, old, pro;
419  	
420  		/* Already lifted? Just return */
421  		if (!(atomic_fetch_uint32_t(&grace_status) & GRACE_STATUS_ACTIVE))
422  			return;
423  	
424  		/*
425  		 * If we know there are no NLM clients, then we can consider the grace
426  		 * period done when all previous clients have sent a RECLAIM_COMPLETE.
427  		 */
428  		PTHREAD_MUTEX_lock(&grace_mutex);
429  		rc_count = atomic_fetch_int32_t(&reclaim_completes);
430  		if (!nfs_param.core_param.enable_NLM)
431  			in_grace = (rc_count != clid_count);
432  	
433  		/* Otherwise, wait for the timeout */
434  		if (in_grace) {
435  			struct timespec timeout, now;
436  			int ret = clock_gettime(CLOCK_MONOTONIC, &now);
437  	
438  			if (ret != 0) {
439  				LogCrit(COMPONENT_MAIN, "Failed to get timestamp");
440  				assert(0);
441  			}
442  	
443  			timeout = current_grace;
444  			timeout.tv_sec += nfs_param.nfsv4_param.grace_period;
445  			in_grace = gsh_time_cmp(&timeout, &now) > 0;
446  		}
447  	
448  		/*
449  		 * Ok, we're basically ready to lift. Ensure there are no outstanding
450  		 * references to the current status of the grace period. If there are,
451  		 * then set the flag saying that there is an upcoming change.
452  		 */
453  	
454  		/*
455  		 * Can we lift the grace period now? If there are any outstanding refs,
456  		 * then just set the grace_change_req flag to indicate that we don't
457  		 * want to hand any more refs out. Otherwise, we try to lift.
458  		 *
459  		 * Clustered backends may need extra checks before they can do so. If
460  		 * the backend does not implement a try_lift_grace operation, then we
461  		 * assume there are no external conditions and that it's always ok.
462  		 */
463  		if (!in_grace) {
464  			cur = atomic_fetch_uint32_t(&grace_status);
465  			do {
466  				old = cur;
467  	
468  				/* Are we already done? Exit if so */
469  				if (!(cur & GRACE_STATUS_ACTIVE)) {
470  					PTHREAD_MUTEX_unlock(&grace_mutex);
471  					return;
472  				}
473  	
474  				/* Record that a change has now been requested */
475  				pro = old | GRACE_STATUS_CHANGE_REQ;
476  				if (pro == old)
477  					break;
478  				cur = __sync_val_compare_and_swap(&grace_status,
479  								  old, pro);
480  			} while (cur != old);
481  	
482  			/* Otherwise, go ahead and lift if we can */
483  			if (!(old & GRACE_STATUS_COUNT_MASK) &&
484  			    (!recovery_backend->try_lift_grace ||
485  			     recovery_backend->try_lift_grace()))
486  				nfs_lift_grace_locked();
487  		}
488  		PTHREAD_MUTEX_unlock(&grace_mutex);
489  	}
490  	
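/*
 * Condition variable and mutex used to wait until the cluster is enforcing
 * the grace period (see nfs_wait_for_grace_enforcement below).
 */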
491  	static pthread_cond_t enforcing_cond = PTHREAD_COND_INITIALIZER;
492  	static pthread_mutex_t enforcing_mutex = PTHREAD_MUTEX_INITIALIZER;
493  	
494  	/* Poll every 5s, just in case we miss the wakeup for some reason */
495  	void nfs_wait_for_grace_enforcement(void)
496  	{
497  		nfs_grace_start_t gsp = { .event = EVENT_JUST_GRACE };
498  	
499  		pthread_mutex_lock(&enforcing_mutex);
500  		nfs_try_lift_grace();
501  		while (nfs_in_grace() && !nfs_grace_enforcing()) {
502  			struct timespec	timeo = { .tv_sec = time(NULL) + 5,
503  						  .tv_nsec = 0 };
504  	
505  			pthread_cond_timedwait(&enforcing_cond, &enforcing_mutex,
506  							&timeo);
507  	
508  			pthread_mutex_unlock(&enforcing_mutex);
509  			nfs_start_grace(&gsp);
510  			nfs_try_lift_grace();
511  			pthread_mutex_lock(&enforcing_mutex);
512  		}
513  		pthread_mutex_unlock(&enforcing_mutex);
514  	}
515  	
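/**
 * @brief Wake threads blocked in nfs_wait_for_grace_enforcement()
 */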
516  	void nfs_notify_grace_waiters(void)
517  	{
518  		pthread_mutex_lock(&enforcing_mutex);
519  		pthread_cond_broadcast(&enforcing_cond);
520  		pthread_mutex_unlock(&enforcing_mutex);
521  	}
522  	
523  	/**
524  	 * @brief Create an entry in the recovery directory
525  	 *
526  	 * This entry allows the client to reclaim state after a server
527  	 * reboot/restart.
528  	 *
529  	 * @param[in] clientid Client record
530  	 */
531  	void nfs4_add_clid(nfs_client_id_t *clientid)
532  	{
533  		PTHREAD_MUTEX_lock(&clientid->cid_mutex);
534  		recovery_backend->add_clid(clientid);
535  		PTHREAD_MUTEX_unlock(&clientid->cid_mutex);
536  	}
537  	
/**
 * @brief Remove a client entry from the recovery directory
 *
 * This function is called when a client expires.
 *
 * @param[in] clientid Client record
 */
544  	void nfs4_rm_clid(nfs_client_id_t *clientid)
545  	{
546  		PTHREAD_MUTEX_lock(&clientid->cid_mutex);
547  		recovery_backend->rm_clid(clientid);
548  		PTHREAD_MUTEX_unlock(&clientid->cid_mutex);
549  	}
550  	
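/**
 * @brief Check whether a client record matches a recovery-list entry
 *
 * Compares the client's recovery tag against the stored entry name.
 */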
551  	static bool check_clid(nfs_client_id_t *clientid, clid_entry_t *clid_ent)
552  	{
553  		bool ret = false;
554  	
555  	
556  		LogDebug(COMPONENT_CLIENTID, "compare %s to %s",
557  			 clientid->cid_recov_tag, clid_ent->cl_name);
558  	
559  		if (clientid->cid_recov_tag &&
560  		    !strncmp(clientid->cid_recov_tag,
561  			     clid_ent->cl_name, PATH_MAX))
562  			ret = true;
563  	
564  		return ret;
565  	}
566  	
/**
 * @brief Determine whether or not this client may reclaim state
 *
 * If the server is not in the grace period, then no reclaim can happen.
 * Caller must hold grace_mutex.
 *
 * @param[in]  clientid     Client record
 * @param[out] clid_ent_arg Matching recovery entry, or NULL if none found
 */
574  	void  nfs4_chk_clid_impl(nfs_client_id_t *clientid, clid_entry_t **clid_ent_arg)
575  	{
576  		struct glist_head *node;
577  		clid_entry_t *clid_ent;
578  		*clid_ent_arg = NULL;
579  	
580  		LogDebug(COMPONENT_CLIENTID, "chk for %lu",
581  			 clientid->cid_clientid);
582  	
583  		/* If there were no clients at time of restart, we're done */
584  		if (clid_count == 0)
585  			return;
586  	
587  		/*
588  		 * loop through the list and try to find this client. If we
589  		 * find it, mark it to allow reclaims.
590  		 */
591  		PTHREAD_MUTEX_lock(&clientid->cid_mutex);
592  		glist_for_each(node, &clid_list) {
593  			clid_ent = glist_entry(node, clid_entry_t, cl_list);
594  			if (check_clid(clientid, clid_ent)) {
595  				if (isDebug(COMPONENT_CLIENTID)) {
596  					char str[LOG_BUFF_LEN] = "\0";
597  					struct display_buffer dspbuf = {
598  						sizeof(str), str, str};
599  	
600  					display_client_id_rec(&dspbuf, clientid);
601  	
602  					LogFullDebug(COMPONENT_CLIENTID,
603  						     "Allowed to reclaim ClientId %s",
604  						     str);
605  				}
606  				clientid->cid_allow_reclaim = true;
607  				*clid_ent_arg = clid_ent;
608  				break;
609  			}
610  		}
611  		PTHREAD_MUTEX_unlock(&clientid->cid_mutex);
612  	}
613  	
614  	void  nfs4_chk_clid(nfs_client_id_t *clientid)
615  	{
616  		clid_entry_t *dummy_clid_ent;
617  	
618  		PTHREAD_MUTEX_lock(&grace_mutex);
619  		nfs4_chk_clid_impl(clientid, &dummy_clid_ent);
620  		PTHREAD_MUTEX_unlock(&grace_mutex);
621  	}
622  	
/**
 * @brief Load clients for recovery
 *
 * @param[in] gsp Grace start info from the failover code, or NULL for an
 *                initial startup grace period
 *
 * Caller must hold grace_mutex.
 */
630  	static void nfs4_recovery_load_clids(nfs_grace_start_t *gsp)
631  	{
632  		LogDebug(COMPONENT_STATE, "Load recovery cli %p", gsp);
633  	
634  		/* A NULL gsp pointer indicates an initial startup grace period */
635  		if (gsp == NULL)
636  			nfs4_cleanup_clid_entries();
637  		recovery_backend->recovery_read_clids(gsp, nfs4_add_clid_entry,
638  							nfs4_add_rfh_entry);
639  	}
640  	
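/**
 * @brief Select the recovery backend named in the configuration
 *
 * @retval 0 on success.
 * @retval -1 if the name does not match a known backend.
 */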
641  	static int load_backend(const char *name)
642  	{
643  		if (!strcmp(name, "fs"))
644  			fs_backend_init(&recovery_backend);
645  	#ifdef USE_RADOS_RECOV
646  		else if (!strcmp(name, "rados_kv"))
647  			rados_kv_backend_init(&recovery_backend);
648  		else if (!strcmp(name, "rados_ng"))
649  			rados_ng_backend_init(&recovery_backend);
650  		else if (!strcmp(name, "rados_cluster"))
651  			rados_cluster_backend_init(&recovery_backend);
652  	#endif
653  		else if (!strcmp(name, "fs_ng"))
654  			fs_ng_backend_init(&recovery_backend);
655  		else
656  			return -1;
657  		return 0;
658  	}
659  	
/**
 * @brief Initialize the recovery backend
 *
 * Load the configured recovery backend and run its init routine. For
 * directory-based backends this creates the recovery directory if it does
 * not yet exist.
 */
667  	int nfs4_recovery_init(void)
668  	{
669  		if (load_backend(nfs_param.nfsv4_param.recovery_backend)) {
670  			LogCrit(COMPONENT_CLIENTID, "Unknown recovery backend");
671  			return -ENOENT;
672  		}
673  		return recovery_backend->recovery_init();
674  	}
675  	
676  	/**
677  	 * @brief Shut down the recovery backend
678  	 *
679  	 * Shut down the recovery backend, cleaning up any clients or tracking
680  	 * structures in preparation for server shutdown.
681  	 */
682  	void nfs4_recovery_shutdown(void)
683  	{
684  		if (recovery_backend->recovery_shutdown)
685  			recovery_backend->recovery_shutdown();
686  	}
687  	
688  	/**
689  	 * @brief Clean up recovery directory
690  	 */
691  	void nfs_end_grace(void)
692  	{
693  		recovery_backend->end_grace();
694  	}
695  	
/**
 * @brief Record a revoked filehandle under the client.
 *
 * @param[in] delr_clid   Client record
 * @param[in] delr_handle Handle of the revoked file
 */
702  	void nfs4_record_revoke(nfs_client_id_t *delr_clid, nfs_fh4 *delr_handle)
703  	{
704  		/* A client's lease is reserved while recalling or revoking a
705  		 * delegation which means the client will not expire until we
706  		 * complete this revoke operation. The only exception is when
707  		 * the reaper thread revokes delegations of an already expired
708  		 * client!
709  		 */
710  		PTHREAD_MUTEX_lock(&delr_clid->cid_mutex);
711  		if (delr_clid->cid_confirmed == EXPIRED_CLIENT_ID) {
712  			/* Called from reaper thread, no need to record
713  			 * revoked file handles for an expired client.
714  			 */
715  			PTHREAD_MUTEX_unlock(&delr_clid->cid_mutex);
716  			return;
717  		}
718  		recovery_backend->add_revoke_fh(delr_clid, delr_handle);
719  		PTHREAD_MUTEX_unlock(&delr_clid->cid_mutex);
720  	}
721  	
/**
 * @brief Decide whether a given delegation may be reclaimed
 *
 * @param[in] clid    Client record
 * @param[in] fhandle Handle of the delegated file
 *
 * @retval true if reclaim is allowed.
 * @retval false if not.
 */
730  	bool nfs4_check_deleg_reclaim(nfs_client_id_t *clid, nfs_fh4 *fhandle)
731  	{
732  		char rhdlstr[NAME_MAX];
733  		struct glist_head *node;
734  		rdel_fh_t *rfh_entry;
735  		clid_entry_t *clid_ent;
736  		int b64ret;
737  		bool retval = true;
738  	
739  		/* Convert nfs_fh4_val into base64 encoded string */
740  		b64ret = base64url_encode(fhandle->nfs_fh4_val, fhandle->nfs_fh4_len,
741  					  rhdlstr, sizeof(rhdlstr));
742  		assert(b64ret != -1);
743  	
744  		PTHREAD_MUTEX_lock(&grace_mutex);
745  		nfs4_chk_clid_impl(clid, &clid_ent);
746  		if (clid_ent) {
747  			glist_for_each(node, &clid_ent->cl_rfh_list) {
748  				rfh_entry = glist_entry(node, rdel_fh_t, rdfh_list);
749  				assert(rfh_entry != NULL);
750  				if (!strcmp(rhdlstr, rfh_entry->rdfh_handle_str)) {
751  					LogFullDebug(COMPONENT_CLIENTID,
752  						"Can't reclaim revoked fh:%s",
753  						rfh_entry->rdfh_handle_str);
754  					retval = false;
755  					break;
756  				}
757  			}
758  		}
759  		PTHREAD_MUTEX_unlock(&grace_mutex);
760  		LogFullDebug(COMPONENT_CLIENTID, "Returning %s",
761  			     retval ? "TRUE" : "FALSE");
762  		return retval;
763  	}
764  	
765  	#ifdef _USE_NLM
766  	/**
767  	 * @brief Release NLM state
768  	 */
769  	static void nlm_releasecall(struct fridgethr_context *ctx)
770  	{
771  		state_nsm_client_t *nsm_cp;
772  		state_status_t err;
773  	
774  		nsm_cp = ctx->arg;
775  		err = state_nlm_notify(nsm_cp, false, 0);
776  		if (err != STATE_SUCCESS)
777  			LogDebug(COMPONENT_STATE,
778  				"state_nlm_notify failed with %d",
779  				err);
780  		dec_nsm_client_ref(nsm_cp);
781  	}
782  	#endif /* _USE_NLM */
783  	
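/**
 * @brief Extract the embedded IPv4 address from an IPv6 address string
 *
 * Scans the ':'-separated tokens for one containing a '.', which is taken
 * to be the dotted-quad portion, and copies it into ipv4. Copies an empty
 * string if no such token is found. Note that strtok_r modifies the ipv6
 * string in place.
 */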
784  	void extractv4(char *ipv6, char *ipv4)
785  	{
786  		char *token, *saveptr;
787  		char *delim = ":";
788  	
789  		token = strtok_r(ipv6, delim, &saveptr);
790  		while (token != NULL) {
791  			/* IPv4 delimiter is '.' */
792  			if (strchr(token, '.') != NULL) {
793  				(void)strcpy(ipv4, token);
794  				return;
795  			}
796  			token = strtok_r(NULL, delim, &saveptr);
797  		}
798  		/* failed, copy a null string */
799  		(void)strcpy(ipv4, "");
800  	}
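/**
 * @brief Compare two IP address strings, allowing IPv4-mapped IPv6 forms
 *
 * If only one of the two strings is IPv6, the IPv4 portion embedded in it
 * (if any) is extracted and compared against the other address.
 */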
801  	
802  	bool ip_str_match(char *release_ip, char *server_ip)
803  	{
804  		bool ripv6, sipv6;
805  		char ipv4[SOCK_NAME_MAX + 1];
806  	
807  		/* IPv6 delimiter is ':' */
808  		ripv6 = (strchr(release_ip, ':') != NULL);
809  		sipv6 = (strchr(server_ip, ':') != NULL);
810  	
811  		if (ripv6) {
812  			if (sipv6)
813  				return !strcmp(release_ip, server_ip);
814  			else {
815  				/* extract v4 addr from release_ip*/
816  				extractv4(release_ip, ipv4);
817  				return !strcmp(ipv4, server_ip);
818  			}
819  		} else {
820  			if (sipv6) {
821  				/* extract v4 addr from server_ip*/
822  				extractv4(server_ip, ipv4);
823  				return !strcmp(ipv4, release_ip);
824  			}
825  		}
826  		/* Both are ipv4 addresses */
827  		return !strcmp(release_ip, server_ip);
828  	}
829  	
830  	/**
831  	 * @brief Release all NLM state
832  	 */
833  	static void nfs_release_nlm_state(char *release_ip)
834  	{
835  	#ifdef _USE_NLM
836  		hash_table_t *ht = ht_nlm_client;
837  		state_nlm_client_t *nlm_cp;
838  		state_nsm_client_t *nsm_cp;
839  		struct rbt_head *head_rbt;
840  		struct rbt_node *pn;
841  		struct hash_data *pdata;
842  		state_status_t state_status;
843  		char serverip[SOCK_NAME_MAX + 1];
844  		int i;
845  	
846  		LogDebug(COMPONENT_STATE, "Release all NLM locks");
847  	
848  		cancel_all_nlm_blocked();
849  	
850  		/* walk the client list and call state_nlm_notify */
851  		for (i = 0; i < ht->parameter.index_size; i++) {
852  			PTHREAD_RWLOCK_wrlock(&ht->partitions[i].lock);
853  			head_rbt = &ht->partitions[i].rbt;
854  			/* go through all entries in the red-black-tree */
855  			RBT_LOOP(head_rbt, pn) {
856  				pdata = RBT_OPAQ(pn);
857  				nlm_cp = (state_nlm_client_t *) pdata->val.addr;
858  				sprint_sockip(&(nlm_cp->slc_server_addr),
859  						serverip,
860  						SOCK_NAME_MAX + 1);
861  				if (ip_str_match(release_ip, serverip)) {
862  					nsm_cp = nlm_cp->slc_nsm_client;
863  					inc_nsm_client_ref(nsm_cp);
864  					state_status = fridgethr_submit(
865  							state_async_fridge,
866  							nlm_releasecall,
867  							nsm_cp);
868  					if (state_status != STATE_SUCCESS) {
869  						dec_nsm_client_ref(nsm_cp);
870  						LogCrit(COMPONENT_STATE,
871  							"failed to submit nlm release thread ");
872  					}
873  				}
874  				RBT_INCREMENT(pn);
875  			}
876  			PTHREAD_RWLOCK_unlock(&ht->partitions[i].lock);
877  		}
878  	#endif /* _USE_NLM */
879  	}
880  	
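/**
 * @brief Check whether a v4 client record matches the IP being released
 *
 * An empty ip matches every client; otherwise the client matches if ip
 * appears as a substring of its client record value.
 */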
881  	static int ip_match(char *ip, nfs_client_id_t *cid)
882  	{
883  		LogDebug(COMPONENT_STATE, "NFS Server V4 match ip %s with (%s)",
884  			 ip, cid->cid_client_record->cr_client_val);
885  	
886  		if (strlen(ip) == 0)	/* No IP all are matching */
887  			return 1;
888  	
889  		if (strstr(cid->cid_client_record->cr_client_val, ip) != NULL)
890  			return 1;
891  	
892  		return 0;		/* no match */
893  	}
894  	
895  	/*
896  	 * try to find a V4 client that matches the IP we are releasing.
897  	 * only search the confirmed clients, unconfirmed clients won't
898  	 * have any state to release.
899  	 */
900  	static void nfs_release_v4_client(char *ip)
901  	{
902  		hash_table_t *ht = ht_confirmed_client_id;
903  		struct rbt_head *head_rbt;
904  		struct rbt_node *pn;
905  		struct hash_data *pdata;
906  		nfs_client_id_t *cp;
907  		nfs_client_record_t *recp;
908  		int i;
909  	
910  		LogEvent(COMPONENT_STATE, "NFS Server V4 recovery release ip %s", ip);
911  	
912  		/* go through the confirmed clients looking for a match */
913  		for (i = 0; i < ht->parameter.index_size; i++) {
914  	
915  			PTHREAD_RWLOCK_wrlock(&ht->partitions[i].lock);
916  			head_rbt = &ht->partitions[i].rbt;
917  	
918  			/* go through all entries in the red-black-tree */
919  			RBT_LOOP(head_rbt, pn) {
920  				pdata = RBT_OPAQ(pn);
921  	
922  				cp = (nfs_client_id_t *) pdata->val.addr;
923  				PTHREAD_MUTEX_lock(&cp->cid_mutex);
924  				if ((cp->cid_confirmed == CONFIRMED_CLIENT_ID)
925  				     && ip_match(ip, cp)) {
926  					inc_client_id_ref(cp);
927  	
928  					/* Take a reference to the client record
929  					 * before we drop cid_mutex. client record
930  					 * may be decoupled, so check if it is still
931  					 * coupled!
932  					 */
933  					recp = cp->cid_client_record;
934  					if (recp)
935  						inc_client_record_ref(recp);
936  	
937  					PTHREAD_MUTEX_unlock(&cp->cid_mutex);
938  	
939  					PTHREAD_RWLOCK_unlock(&ht->partitions[i].lock);
940  	
				/* nfs_client_id_expire requires cr_mutex
				 * if not decoupled already.
				 */
944  					if (recp)
945  						PTHREAD_MUTEX_lock(&recp->cr_mutex);
946  	
947  					nfs_client_id_expire(cp, true);
948  	
949  					if (recp) {
950  						PTHREAD_MUTEX_unlock(&recp->cr_mutex);
951  						dec_client_record_ref(recp);
952  					}
953  	
954  					dec_client_id_ref(cp);
955  					return;
956  	
957  				} else {
958  					PTHREAD_MUTEX_unlock(&cp->cid_mutex);
959  				}
960  				RBT_INCREMENT(pn);
961  			}
962  			PTHREAD_RWLOCK_unlock(&ht->partitions[i].lock);
963  		}
964  	}
965  	
966  	/** @} */
967