16714d8e8SKurt Hackel /* -*- mode: c; c-basic-offset: 8; -*- 26714d8e8SKurt Hackel * vim: noexpandtab sw=8 ts=8 sts=0: 36714d8e8SKurt Hackel * 46714d8e8SKurt Hackel * dlmmod.c 56714d8e8SKurt Hackel * 66714d8e8SKurt Hackel * standalone DLM module 76714d8e8SKurt Hackel * 86714d8e8SKurt Hackel * Copyright (C) 2004 Oracle. All rights reserved. 96714d8e8SKurt Hackel * 106714d8e8SKurt Hackel * This program is free software; you can redistribute it and/or 116714d8e8SKurt Hackel * modify it under the terms of the GNU General Public 126714d8e8SKurt Hackel * License as published by the Free Software Foundation; either 136714d8e8SKurt Hackel * version 2 of the License, or (at your option) any later version. 146714d8e8SKurt Hackel * 156714d8e8SKurt Hackel * This program is distributed in the hope that it will be useful, 166714d8e8SKurt Hackel * but WITHOUT ANY WARRANTY; without even the implied warranty of 176714d8e8SKurt Hackel * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 186714d8e8SKurt Hackel * General Public License for more details. 196714d8e8SKurt Hackel * 206714d8e8SKurt Hackel * You should have received a copy of the GNU General Public 216714d8e8SKurt Hackel * License along with this program; if not, write to the 226714d8e8SKurt Hackel * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 236714d8e8SKurt Hackel * Boston, MA 021110-1307, USA. 
246714d8e8SKurt Hackel * 256714d8e8SKurt Hackel */ 266714d8e8SKurt Hackel 276714d8e8SKurt Hackel 286714d8e8SKurt Hackel #include <linux/module.h> 296714d8e8SKurt Hackel #include <linux/fs.h> 306714d8e8SKurt Hackel #include <linux/types.h> 316714d8e8SKurt Hackel #include <linux/slab.h> 326714d8e8SKurt Hackel #include <linux/highmem.h> 336714d8e8SKurt Hackel #include <linux/utsname.h> 346714d8e8SKurt Hackel #include <linux/init.h> 356714d8e8SKurt Hackel #include <linux/sysctl.h> 366714d8e8SKurt Hackel #include <linux/random.h> 376714d8e8SKurt Hackel #include <linux/blkdev.h> 386714d8e8SKurt Hackel #include <linux/socket.h> 396714d8e8SKurt Hackel #include <linux/inet.h> 406714d8e8SKurt Hackel #include <linux/spinlock.h> 416714d8e8SKurt Hackel #include <linux/delay.h> 426714d8e8SKurt Hackel 436714d8e8SKurt Hackel 446714d8e8SKurt Hackel #include "cluster/heartbeat.h" 456714d8e8SKurt Hackel #include "cluster/nodemanager.h" 466714d8e8SKurt Hackel #include "cluster/tcp.h" 476714d8e8SKurt Hackel 486714d8e8SKurt Hackel #include "dlmapi.h" 496714d8e8SKurt Hackel #include "dlmcommon.h" 5082353b59SAdrian Bunk #include "dlmdomain.h" 51e5a0334cSSunil Mushran #include "dlmdebug.h" 526714d8e8SKurt Hackel 536714d8e8SKurt Hackel #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 546714d8e8SKurt Hackel #include "cluster/masklog.h" 556714d8e8SKurt Hackel 566714d8e8SKurt Hackel static void dlm_mle_node_down(struct dlm_ctxt *dlm, 576714d8e8SKurt Hackel struct dlm_master_list_entry *mle, 586714d8e8SKurt Hackel struct o2nm_node *node, 596714d8e8SKurt Hackel int idx); 606714d8e8SKurt Hackel static void dlm_mle_node_up(struct dlm_ctxt *dlm, 616714d8e8SKurt Hackel struct dlm_master_list_entry *mle, 626714d8e8SKurt Hackel struct o2nm_node *node, 636714d8e8SKurt Hackel int idx); 646714d8e8SKurt Hackel 656714d8e8SKurt Hackel static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); 66ba2bf218SKurt Hackel static int dlm_do_assert_master(struct dlm_ctxt *dlm, 67ba2bf218SKurt 
Hackel struct dlm_lock_resource *res, 68ba2bf218SKurt Hackel void *nodemap, u32 flags); 69f3f85464SSunil Mushran static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); 706714d8e8SKurt Hackel 71f77a9a78SSunil Mushran static inline void __dlm_mle_name(struct dlm_master_list_entry *mle, 722ed6c750SSunil Mushran unsigned char **name, unsigned int *namelen, 732ed6c750SSunil Mushran unsigned int *namehash) 74f77a9a78SSunil Mushran { 75f77a9a78SSunil Mushran BUG_ON(mle->type != DLM_MLE_BLOCK && 76f77a9a78SSunil Mushran mle->type != DLM_MLE_MASTER && 77f77a9a78SSunil Mushran mle->type != DLM_MLE_MIGRATION); 78f77a9a78SSunil Mushran 79f77a9a78SSunil Mushran if (mle->type != DLM_MLE_MASTER) { 80f77a9a78SSunil Mushran *name = mle->u.mlename.name; 81f77a9a78SSunil Mushran *namelen = mle->u.mlename.len; 822ed6c750SSunil Mushran if (namehash) 832ed6c750SSunil Mushran *namehash = mle->u.mlename.hash; 84f77a9a78SSunil Mushran } else { 85f77a9a78SSunil Mushran *name = (unsigned char *)mle->u.mleres->lockname.name; 86f77a9a78SSunil Mushran *namelen = mle->u.mleres->lockname.len; 872ed6c750SSunil Mushran if (namehash) 882ed6c750SSunil Mushran *namehash = mle->u.mleres->lockname.hash; 89f77a9a78SSunil Mushran } 90f77a9a78SSunil Mushran } 91f77a9a78SSunil Mushran 926714d8e8SKurt Hackel static inline int dlm_mle_equal(struct dlm_ctxt *dlm, 936714d8e8SKurt Hackel struct dlm_master_list_entry *mle, 946714d8e8SKurt Hackel const char *name, 956714d8e8SKurt Hackel unsigned int namelen) 966714d8e8SKurt Hackel { 97f77a9a78SSunil Mushran unsigned char *mlename; 98f77a9a78SSunil Mushran unsigned int mlelen; 996714d8e8SKurt Hackel 1006714d8e8SKurt Hackel if (dlm != mle->dlm) 1016714d8e8SKurt Hackel return 0; 1026714d8e8SKurt Hackel 1032ed6c750SSunil Mushran __dlm_mle_name(mle, &mlename, &mlelen, NULL); 104f77a9a78SSunil Mushran 105f77a9a78SSunil Mushran if (namelen != mlelen || memcmp(name, mlename, namelen) != 0) 1066714d8e8SKurt Hackel return 0; 107f77a9a78SSunil Mushran 
1086714d8e8SKurt Hackel return 1; 1096714d8e8SKurt Hackel } 1106714d8e8SKurt Hackel 111724bdca9SSunil Mushran static struct kmem_cache *dlm_lockres_cache = NULL; 112724bdca9SSunil Mushran static struct kmem_cache *dlm_lockname_cache = NULL; 113e18b890bSChristoph Lameter static struct kmem_cache *dlm_mle_cache = NULL; 1146714d8e8SKurt Hackel 1156714d8e8SKurt Hackel static void dlm_mle_release(struct kref *kref); 1166714d8e8SKurt Hackel static void dlm_init_mle(struct dlm_master_list_entry *mle, 1176714d8e8SKurt Hackel enum dlm_mle_type type, 1186714d8e8SKurt Hackel struct dlm_ctxt *dlm, 1196714d8e8SKurt Hackel struct dlm_lock_resource *res, 1206714d8e8SKurt Hackel const char *name, 1216714d8e8SKurt Hackel unsigned int namelen); 1226714d8e8SKurt Hackel static void dlm_put_mle(struct dlm_master_list_entry *mle); 1236714d8e8SKurt Hackel static void __dlm_put_mle(struct dlm_master_list_entry *mle); 1246714d8e8SKurt Hackel static int dlm_find_mle(struct dlm_ctxt *dlm, 1256714d8e8SKurt Hackel struct dlm_master_list_entry **mle, 1266714d8e8SKurt Hackel char *name, unsigned int namelen); 1276714d8e8SKurt Hackel 128ba2bf218SKurt Hackel static int dlm_do_master_request(struct dlm_lock_resource *res, 129ba2bf218SKurt Hackel struct dlm_master_list_entry *mle, int to); 1306714d8e8SKurt Hackel 1316714d8e8SKurt Hackel 1326714d8e8SKurt Hackel static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, 1336714d8e8SKurt Hackel struct dlm_lock_resource *res, 1346714d8e8SKurt Hackel struct dlm_master_list_entry *mle, 1356714d8e8SKurt Hackel int *blocked); 1366714d8e8SKurt Hackel static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, 1376714d8e8SKurt Hackel struct dlm_lock_resource *res, 1386714d8e8SKurt Hackel struct dlm_master_list_entry *mle, 1396714d8e8SKurt Hackel int blocked); 1406714d8e8SKurt Hackel static int dlm_add_migration_mle(struct dlm_ctxt *dlm, 1416714d8e8SKurt Hackel struct dlm_lock_resource *res, 1426714d8e8SKurt Hackel struct dlm_master_list_entry *mle, 
1436714d8e8SKurt Hackel struct dlm_master_list_entry **oldmle, 1446714d8e8SKurt Hackel const char *name, unsigned int namelen, 1456714d8e8SKurt Hackel u8 new_master, u8 master); 1466714d8e8SKurt Hackel 1476714d8e8SKurt Hackel static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 1486714d8e8SKurt Hackel struct dlm_lock_resource *res); 1496714d8e8SKurt Hackel static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 1506714d8e8SKurt Hackel struct dlm_lock_resource *res); 1516714d8e8SKurt Hackel static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, 1526714d8e8SKurt Hackel struct dlm_lock_resource *res, 1536714d8e8SKurt Hackel u8 target); 154c03872f5SKurt Hackel static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 155c03872f5SKurt Hackel struct dlm_lock_resource *res); 1566714d8e8SKurt Hackel 1576714d8e8SKurt Hackel 1586714d8e8SKurt Hackel int dlm_is_host_down(int errno) 1596714d8e8SKurt Hackel { 1606714d8e8SKurt Hackel switch (errno) { 1616714d8e8SKurt Hackel case -EBADF: 1626714d8e8SKurt Hackel case -ECONNREFUSED: 1636714d8e8SKurt Hackel case -ENOTCONN: 1646714d8e8SKurt Hackel case -ECONNRESET: 1656714d8e8SKurt Hackel case -EPIPE: 1666714d8e8SKurt Hackel case -EHOSTDOWN: 1676714d8e8SKurt Hackel case -EHOSTUNREACH: 1686714d8e8SKurt Hackel case -ETIMEDOUT: 1696714d8e8SKurt Hackel case -ECONNABORTED: 1706714d8e8SKurt Hackel case -ENETDOWN: 1716714d8e8SKurt Hackel case -ENETUNREACH: 1726714d8e8SKurt Hackel case -ENETRESET: 1736714d8e8SKurt Hackel case -ESHUTDOWN: 1746714d8e8SKurt Hackel case -ENOPROTOOPT: 1756714d8e8SKurt Hackel case -EINVAL: /* if returned from our tcp code, 1766714d8e8SKurt Hackel this means there is no socket */ 1776714d8e8SKurt Hackel return 1; 1786714d8e8SKurt Hackel } 1796714d8e8SKurt Hackel return 0; 1806714d8e8SKurt Hackel } 1816714d8e8SKurt Hackel 1826714d8e8SKurt Hackel 1836714d8e8SKurt Hackel /* 1846714d8e8SKurt Hackel * MASTER LIST FUNCTIONS 1856714d8e8SKurt Hackel */ 1866714d8e8SKurt Hackel 1876714d8e8SKurt Hackel 
1886714d8e8SKurt Hackel /* 1896714d8e8SKurt Hackel * regarding master list entries and heartbeat callbacks: 1906714d8e8SKurt Hackel * 1916714d8e8SKurt Hackel * in order to avoid sleeping and allocation that occurs in 1926714d8e8SKurt Hackel * heartbeat, master list entries are simply attached to the 1936714d8e8SKurt Hackel * dlm's established heartbeat callbacks. the mle is attached 1946714d8e8SKurt Hackel * when it is created, and since the dlm->spinlock is held at 1956714d8e8SKurt Hackel * that time, any heartbeat event will be properly discovered 1966714d8e8SKurt Hackel * by the mle. the mle needs to be detached from the 1976714d8e8SKurt Hackel * dlm->mle_hb_events list as soon as heartbeat events are no 1986714d8e8SKurt Hackel * longer useful to the mle, and before the mle is freed. 1996714d8e8SKurt Hackel * 2006714d8e8SKurt Hackel * as a general rule, heartbeat events are no longer needed by 2016714d8e8SKurt Hackel * the mle once an "answer" regarding the lock master has been 2026714d8e8SKurt Hackel * received. 
2036714d8e8SKurt Hackel */ 2046714d8e8SKurt Hackel static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, 2056714d8e8SKurt Hackel struct dlm_master_list_entry *mle) 2066714d8e8SKurt Hackel { 2076714d8e8SKurt Hackel assert_spin_locked(&dlm->spinlock); 2086714d8e8SKurt Hackel 2096714d8e8SKurt Hackel list_add_tail(&mle->hb_events, &dlm->mle_hb_events); 2106714d8e8SKurt Hackel } 2116714d8e8SKurt Hackel 2126714d8e8SKurt Hackel 2136714d8e8SKurt Hackel static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, 2146714d8e8SKurt Hackel struct dlm_master_list_entry *mle) 2156714d8e8SKurt Hackel { 2166714d8e8SKurt Hackel if (!list_empty(&mle->hb_events)) 2176714d8e8SKurt Hackel list_del_init(&mle->hb_events); 2186714d8e8SKurt Hackel } 2196714d8e8SKurt Hackel 2206714d8e8SKurt Hackel 2216714d8e8SKurt Hackel static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, 2226714d8e8SKurt Hackel struct dlm_master_list_entry *mle) 2236714d8e8SKurt Hackel { 2246714d8e8SKurt Hackel spin_lock(&dlm->spinlock); 2256714d8e8SKurt Hackel __dlm_mle_detach_hb_events(dlm, mle); 2266714d8e8SKurt Hackel spin_unlock(&dlm->spinlock); 2276714d8e8SKurt Hackel } 2286714d8e8SKurt Hackel 229a2bf0477SKurt Hackel static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) 230a2bf0477SKurt Hackel { 231a2bf0477SKurt Hackel struct dlm_ctxt *dlm; 232a2bf0477SKurt Hackel dlm = mle->dlm; 233a2bf0477SKurt Hackel 234a2bf0477SKurt Hackel assert_spin_locked(&dlm->spinlock); 235a2bf0477SKurt Hackel assert_spin_locked(&dlm->master_lock); 236a2bf0477SKurt Hackel mle->inuse++; 237a2bf0477SKurt Hackel kref_get(&mle->mle_refs); 238a2bf0477SKurt Hackel } 239a2bf0477SKurt Hackel 240a2bf0477SKurt Hackel static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) 241a2bf0477SKurt Hackel { 242a2bf0477SKurt Hackel struct dlm_ctxt *dlm; 243a2bf0477SKurt Hackel dlm = mle->dlm; 244a2bf0477SKurt Hackel 245a2bf0477SKurt Hackel spin_lock(&dlm->spinlock); 246a2bf0477SKurt Hackel 
spin_lock(&dlm->master_lock); 247a2bf0477SKurt Hackel mle->inuse--; 248a2bf0477SKurt Hackel __dlm_put_mle(mle); 249a2bf0477SKurt Hackel spin_unlock(&dlm->master_lock); 250a2bf0477SKurt Hackel spin_unlock(&dlm->spinlock); 251a2bf0477SKurt Hackel 252a2bf0477SKurt Hackel } 253a2bf0477SKurt Hackel 2546714d8e8SKurt Hackel /* remove from list and free */ 2556714d8e8SKurt Hackel static void __dlm_put_mle(struct dlm_master_list_entry *mle) 2566714d8e8SKurt Hackel { 2576714d8e8SKurt Hackel struct dlm_ctxt *dlm; 2586714d8e8SKurt Hackel dlm = mle->dlm; 2596714d8e8SKurt Hackel 2606714d8e8SKurt Hackel assert_spin_locked(&dlm->spinlock); 2616714d8e8SKurt Hackel assert_spin_locked(&dlm->master_lock); 262aa852354SKurt Hackel if (!atomic_read(&mle->mle_refs.refcount)) { 263aa852354SKurt Hackel /* this may or may not crash, but who cares. 264aa852354SKurt Hackel * it's a BUG. */ 265aa852354SKurt Hackel mlog(ML_ERROR, "bad mle: %p\n", mle); 266aa852354SKurt Hackel dlm_print_one_mle(mle); 267aa852354SKurt Hackel BUG(); 268aa852354SKurt Hackel } else 2696714d8e8SKurt Hackel kref_put(&mle->mle_refs, dlm_mle_release); 2706714d8e8SKurt Hackel } 2716714d8e8SKurt Hackel 2726714d8e8SKurt Hackel 2736714d8e8SKurt Hackel /* must not have any spinlocks coming in */ 2746714d8e8SKurt Hackel static void dlm_put_mle(struct dlm_master_list_entry *mle) 2756714d8e8SKurt Hackel { 2766714d8e8SKurt Hackel struct dlm_ctxt *dlm; 2776714d8e8SKurt Hackel dlm = mle->dlm; 2786714d8e8SKurt Hackel 2796714d8e8SKurt Hackel spin_lock(&dlm->spinlock); 2806714d8e8SKurt Hackel spin_lock(&dlm->master_lock); 2816714d8e8SKurt Hackel __dlm_put_mle(mle); 2826714d8e8SKurt Hackel spin_unlock(&dlm->master_lock); 2836714d8e8SKurt Hackel spin_unlock(&dlm->spinlock); 2846714d8e8SKurt Hackel } 2856714d8e8SKurt Hackel 2866714d8e8SKurt Hackel static inline void dlm_get_mle(struct dlm_master_list_entry *mle) 2876714d8e8SKurt Hackel { 2886714d8e8SKurt Hackel kref_get(&mle->mle_refs); 2896714d8e8SKurt Hackel } 2906714d8e8SKurt Hackel 
2916714d8e8SKurt Hackel static void dlm_init_mle(struct dlm_master_list_entry *mle, 2926714d8e8SKurt Hackel enum dlm_mle_type type, 2936714d8e8SKurt Hackel struct dlm_ctxt *dlm, 2946714d8e8SKurt Hackel struct dlm_lock_resource *res, 2956714d8e8SKurt Hackel const char *name, 2966714d8e8SKurt Hackel unsigned int namelen) 2976714d8e8SKurt Hackel { 2986714d8e8SKurt Hackel assert_spin_locked(&dlm->spinlock); 2996714d8e8SKurt Hackel 3006714d8e8SKurt Hackel mle->dlm = dlm; 3016714d8e8SKurt Hackel mle->type = type; 3022ed6c750SSunil Mushran INIT_HLIST_NODE(&mle->master_hash_node); 3036714d8e8SKurt Hackel INIT_LIST_HEAD(&mle->hb_events); 3046714d8e8SKurt Hackel memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 3056714d8e8SKurt Hackel spin_lock_init(&mle->spinlock); 3066714d8e8SKurt Hackel init_waitqueue_head(&mle->wq); 3076714d8e8SKurt Hackel atomic_set(&mle->woken, 0); 3086714d8e8SKurt Hackel kref_init(&mle->mle_refs); 3096714d8e8SKurt Hackel memset(mle->response_map, 0, sizeof(mle->response_map)); 3106714d8e8SKurt Hackel mle->master = O2NM_MAX_NODES; 3116714d8e8SKurt Hackel mle->new_master = O2NM_MAX_NODES; 312a2bf0477SKurt Hackel mle->inuse = 0; 3136714d8e8SKurt Hackel 314f77a9a78SSunil Mushran BUG_ON(mle->type != DLM_MLE_BLOCK && 315f77a9a78SSunil Mushran mle->type != DLM_MLE_MASTER && 316f77a9a78SSunil Mushran mle->type != DLM_MLE_MIGRATION); 317f77a9a78SSunil Mushran 3186714d8e8SKurt Hackel if (mle->type == DLM_MLE_MASTER) { 3196714d8e8SKurt Hackel BUG_ON(!res); 320f77a9a78SSunil Mushran mle->u.mleres = res; 321f77a9a78SSunil Mushran } else { 3226714d8e8SKurt Hackel BUG_ON(!name); 323f77a9a78SSunil Mushran memcpy(mle->u.mlename.name, name, namelen); 324f77a9a78SSunil Mushran mle->u.mlename.len = namelen; 3252ed6c750SSunil Mushran mle->u.mlename.hash = dlm_lockid_hash(name, namelen); 3266714d8e8SKurt Hackel } 3276714d8e8SKurt Hackel 3282041d8fdSSunil Mushran atomic_inc(&dlm->mle_tot_count[mle->type]); 3292041d8fdSSunil Mushran 
atomic_inc(&dlm->mle_cur_count[mle->type]); 3302041d8fdSSunil Mushran 3316714d8e8SKurt Hackel /* copy off the node_map and register hb callbacks on our copy */ 3326714d8e8SKurt Hackel memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); 3336714d8e8SKurt Hackel memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); 3346714d8e8SKurt Hackel clear_bit(dlm->node_num, mle->vote_map); 3356714d8e8SKurt Hackel clear_bit(dlm->node_num, mle->node_map); 3366714d8e8SKurt Hackel 3376714d8e8SKurt Hackel /* attach the mle to the domain node up/down events */ 3386714d8e8SKurt Hackel __dlm_mle_attach_hb_events(dlm, mle); 3396714d8e8SKurt Hackel } 3406714d8e8SKurt Hackel 3411c084577SSunil Mushran void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) 3421c084577SSunil Mushran { 3431c084577SSunil Mushran assert_spin_locked(&dlm->spinlock); 3441c084577SSunil Mushran assert_spin_locked(&dlm->master_lock); 3451c084577SSunil Mushran 3462ed6c750SSunil Mushran if (!hlist_unhashed(&mle->master_hash_node)) 3472ed6c750SSunil Mushran hlist_del_init(&mle->master_hash_node); 3481c084577SSunil Mushran } 3491c084577SSunil Mushran 3501c084577SSunil Mushran void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) 3511c084577SSunil Mushran { 3522ed6c750SSunil Mushran struct hlist_head *bucket; 3532ed6c750SSunil Mushran unsigned char *mname; 3542ed6c750SSunil Mushran unsigned int mlen, hash; 3552ed6c750SSunil Mushran 3561c084577SSunil Mushran assert_spin_locked(&dlm->master_lock); 3571c084577SSunil Mushran 3582ed6c750SSunil Mushran __dlm_mle_name(mle, &mname, &mlen, &hash); 3592ed6c750SSunil Mushran bucket = dlm_master_hash(dlm, hash); 3602ed6c750SSunil Mushran hlist_add_head(&mle->master_hash_node, bucket); 3611c084577SSunil Mushran } 3626714d8e8SKurt Hackel 3636714d8e8SKurt Hackel /* returns 1 if found, 0 if not */ 3646714d8e8SKurt Hackel static int dlm_find_mle(struct dlm_ctxt *dlm, 3656714d8e8SKurt Hackel struct dlm_master_list_entry 
**mle, 3666714d8e8SKurt Hackel char *name, unsigned int namelen) 3676714d8e8SKurt Hackel { 3686714d8e8SKurt Hackel struct dlm_master_list_entry *tmpmle; 3692ed6c750SSunil Mushran struct hlist_head *bucket; 3702ed6c750SSunil Mushran struct hlist_node *list; 3712ed6c750SSunil Mushran unsigned int hash; 3726714d8e8SKurt Hackel 3736714d8e8SKurt Hackel assert_spin_locked(&dlm->master_lock); 3746714d8e8SKurt Hackel 3752ed6c750SSunil Mushran hash = dlm_lockid_hash(name, namelen); 3762ed6c750SSunil Mushran bucket = dlm_master_hash(dlm, hash); 3772ed6c750SSunil Mushran hlist_for_each(list, bucket) { 3782ed6c750SSunil Mushran tmpmle = hlist_entry(list, struct dlm_master_list_entry, 3792ed6c750SSunil Mushran master_hash_node); 3806714d8e8SKurt Hackel if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 3816714d8e8SKurt Hackel continue; 3826714d8e8SKurt Hackel dlm_get_mle(tmpmle); 3836714d8e8SKurt Hackel *mle = tmpmle; 3846714d8e8SKurt Hackel return 1; 3856714d8e8SKurt Hackel } 3866714d8e8SKurt Hackel return 0; 3876714d8e8SKurt Hackel } 3886714d8e8SKurt Hackel 3896714d8e8SKurt Hackel void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) 3906714d8e8SKurt Hackel { 3916714d8e8SKurt Hackel struct dlm_master_list_entry *mle; 3926714d8e8SKurt Hackel 3936714d8e8SKurt Hackel assert_spin_locked(&dlm->spinlock); 3946714d8e8SKurt Hackel 395800deef3SChristoph Hellwig list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { 3966714d8e8SKurt Hackel if (node_up) 3976714d8e8SKurt Hackel dlm_mle_node_up(dlm, mle, NULL, idx); 3986714d8e8SKurt Hackel else 3996714d8e8SKurt Hackel dlm_mle_node_down(dlm, mle, NULL, idx); 4006714d8e8SKurt Hackel } 4016714d8e8SKurt Hackel } 4026714d8e8SKurt Hackel 4036714d8e8SKurt Hackel static void dlm_mle_node_down(struct dlm_ctxt *dlm, 4046714d8e8SKurt Hackel struct dlm_master_list_entry *mle, 4056714d8e8SKurt Hackel struct o2nm_node *node, int idx) 4066714d8e8SKurt Hackel { 4076714d8e8SKurt Hackel spin_lock(&mle->spinlock); 4086714d8e8SKurt 
Hackel 4096714d8e8SKurt Hackel if (!test_bit(idx, mle->node_map)) 4106714d8e8SKurt Hackel mlog(0, "node %u already removed from nodemap!\n", idx); 4116714d8e8SKurt Hackel else 4126714d8e8SKurt Hackel clear_bit(idx, mle->node_map); 4136714d8e8SKurt Hackel 4146714d8e8SKurt Hackel spin_unlock(&mle->spinlock); 4156714d8e8SKurt Hackel } 4166714d8e8SKurt Hackel 4176714d8e8SKurt Hackel static void dlm_mle_node_up(struct dlm_ctxt *dlm, 4186714d8e8SKurt Hackel struct dlm_master_list_entry *mle, 4196714d8e8SKurt Hackel struct o2nm_node *node, int idx) 4206714d8e8SKurt Hackel { 4216714d8e8SKurt Hackel spin_lock(&mle->spinlock); 4226714d8e8SKurt Hackel 4236714d8e8SKurt Hackel if (test_bit(idx, mle->node_map)) 4246714d8e8SKurt Hackel mlog(0, "node %u already in node map!\n", idx); 4256714d8e8SKurt Hackel else 4266714d8e8SKurt Hackel set_bit(idx, mle->node_map); 4276714d8e8SKurt Hackel 4286714d8e8SKurt Hackel spin_unlock(&mle->spinlock); 4296714d8e8SKurt Hackel } 4306714d8e8SKurt Hackel 4316714d8e8SKurt Hackel 4326714d8e8SKurt Hackel int dlm_init_mle_cache(void) 4336714d8e8SKurt Hackel { 43412eb0035SSunil Mushran dlm_mle_cache = kmem_cache_create("o2dlm_mle", 4356714d8e8SKurt Hackel sizeof(struct dlm_master_list_entry), 4366714d8e8SKurt Hackel 0, SLAB_HWCACHE_ALIGN, 43720c2df83SPaul Mundt NULL); 4386714d8e8SKurt Hackel if (dlm_mle_cache == NULL) 4396714d8e8SKurt Hackel return -ENOMEM; 4406714d8e8SKurt Hackel return 0; 4416714d8e8SKurt Hackel } 4426714d8e8SKurt Hackel 4436714d8e8SKurt Hackel void dlm_destroy_mle_cache(void) 4446714d8e8SKurt Hackel { 4456714d8e8SKurt Hackel if (dlm_mle_cache) 4466714d8e8SKurt Hackel kmem_cache_destroy(dlm_mle_cache); 4476714d8e8SKurt Hackel } 4486714d8e8SKurt Hackel 4496714d8e8SKurt Hackel static void dlm_mle_release(struct kref *kref) 4506714d8e8SKurt Hackel { 4516714d8e8SKurt Hackel struct dlm_master_list_entry *mle; 4526714d8e8SKurt Hackel struct dlm_ctxt *dlm; 4532ed6c750SSunil Mushran unsigned char *mname; 4542ed6c750SSunil Mushran unsigned 
int mlen; 4556714d8e8SKurt Hackel 4566714d8e8SKurt Hackel mlog_entry_void(); 4576714d8e8SKurt Hackel 4586714d8e8SKurt Hackel mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 4596714d8e8SKurt Hackel dlm = mle->dlm; 4606714d8e8SKurt Hackel 4616714d8e8SKurt Hackel assert_spin_locked(&dlm->spinlock); 4626714d8e8SKurt Hackel assert_spin_locked(&dlm->master_lock); 4636714d8e8SKurt Hackel 4642ed6c750SSunil Mushran __dlm_mle_name(mle, &mname, &mlen, NULL); 4652ed6c750SSunil Mushran mlog(0, "Releasing mle for %.*s, type %d\n", mlen, mname, mle->type); 4662ed6c750SSunil Mushran 4676714d8e8SKurt Hackel /* remove from list if not already */ 4681c084577SSunil Mushran __dlm_unlink_mle(dlm, mle); 4696714d8e8SKurt Hackel 4706714d8e8SKurt Hackel /* detach the mle from the domain node up/down events */ 4716714d8e8SKurt Hackel __dlm_mle_detach_hb_events(dlm, mle); 4726714d8e8SKurt Hackel 4732041d8fdSSunil Mushran atomic_dec(&dlm->mle_cur_count[mle->type]); 4742041d8fdSSunil Mushran 4756714d8e8SKurt Hackel /* NOTE: kfree under spinlock here. 4766714d8e8SKurt Hackel * if this is bad, we can move this to a freelist. 
*/ 4776714d8e8SKurt Hackel kmem_cache_free(dlm_mle_cache, mle); 4786714d8e8SKurt Hackel } 4796714d8e8SKurt Hackel 4806714d8e8SKurt Hackel 4816714d8e8SKurt Hackel /* 4826714d8e8SKurt Hackel * LOCK RESOURCE FUNCTIONS 4836714d8e8SKurt Hackel */ 4846714d8e8SKurt Hackel 485724bdca9SSunil Mushran int dlm_init_master_caches(void) 486724bdca9SSunil Mushran { 487724bdca9SSunil Mushran dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", 488724bdca9SSunil Mushran sizeof(struct dlm_lock_resource), 489724bdca9SSunil Mushran 0, SLAB_HWCACHE_ALIGN, NULL); 490724bdca9SSunil Mushran if (!dlm_lockres_cache) 491724bdca9SSunil Mushran goto bail; 492724bdca9SSunil Mushran 493724bdca9SSunil Mushran dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", 494724bdca9SSunil Mushran DLM_LOCKID_NAME_MAX, 0, 495724bdca9SSunil Mushran SLAB_HWCACHE_ALIGN, NULL); 496724bdca9SSunil Mushran if (!dlm_lockname_cache) 497724bdca9SSunil Mushran goto bail; 498724bdca9SSunil Mushran 499724bdca9SSunil Mushran return 0; 500724bdca9SSunil Mushran bail: 501724bdca9SSunil Mushran dlm_destroy_master_caches(); 502724bdca9SSunil Mushran return -ENOMEM; 503724bdca9SSunil Mushran } 504724bdca9SSunil Mushran 505724bdca9SSunil Mushran void dlm_destroy_master_caches(void) 506724bdca9SSunil Mushran { 507724bdca9SSunil Mushran if (dlm_lockname_cache) 508724bdca9SSunil Mushran kmem_cache_destroy(dlm_lockname_cache); 509724bdca9SSunil Mushran 510724bdca9SSunil Mushran if (dlm_lockres_cache) 511724bdca9SSunil Mushran kmem_cache_destroy(dlm_lockres_cache); 512724bdca9SSunil Mushran } 513724bdca9SSunil Mushran 5146714d8e8SKurt Hackel static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, 5156714d8e8SKurt Hackel struct dlm_lock_resource *res, 5166714d8e8SKurt Hackel u8 owner) 5176714d8e8SKurt Hackel { 5186714d8e8SKurt Hackel assert_spin_locked(&res->spinlock); 5196714d8e8SKurt Hackel 5206714d8e8SKurt Hackel res->owner = owner; 5216714d8e8SKurt Hackel } 5226714d8e8SKurt Hackel 5236714d8e8SKurt Hackel void 
dlm_change_lockres_owner(struct dlm_ctxt *dlm, 5246714d8e8SKurt Hackel struct dlm_lock_resource *res, u8 owner) 5256714d8e8SKurt Hackel { 5266714d8e8SKurt Hackel assert_spin_locked(&res->spinlock); 5276714d8e8SKurt Hackel 5286800791aSSunil Mushran if (owner != res->owner) 5296714d8e8SKurt Hackel dlm_set_lockres_owner(dlm, res, owner); 5306714d8e8SKurt Hackel } 5316714d8e8SKurt Hackel 5326714d8e8SKurt Hackel 5336714d8e8SKurt Hackel static void dlm_lockres_release(struct kref *kref) 5346714d8e8SKurt Hackel { 5356714d8e8SKurt Hackel struct dlm_lock_resource *res; 536b0d4f817SSunil Mushran struct dlm_ctxt *dlm; 5376714d8e8SKurt Hackel 5386714d8e8SKurt Hackel res = container_of(kref, struct dlm_lock_resource, refs); 539b0d4f817SSunil Mushran dlm = res->dlm; 5406714d8e8SKurt Hackel 5416714d8e8SKurt Hackel /* This should not happen -- all lockres' have a name 5426714d8e8SKurt Hackel * associated with them at init time. */ 5436714d8e8SKurt Hackel BUG_ON(!res->lockname.name); 5446714d8e8SKurt Hackel 5456714d8e8SKurt Hackel mlog(0, "destroying lockres %.*s\n", res->lockname.len, 5466714d8e8SKurt Hackel res->lockname.name); 5476714d8e8SKurt Hackel 548b0d4f817SSunil Mushran spin_lock(&dlm->track_lock); 54929576f8bSSunil Mushran if (!list_empty(&res->tracking)) 55029576f8bSSunil Mushran list_del_init(&res->tracking); 55129576f8bSSunil Mushran else { 55229576f8bSSunil Mushran mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", 55329576f8bSSunil Mushran res->lockname.len, res->lockname.name); 55429576f8bSSunil Mushran dlm_print_one_lock_resource(res); 55529576f8bSSunil Mushran } 556b0d4f817SSunil Mushran spin_unlock(&dlm->track_lock); 557b0d4f817SSunil Mushran 5586800791aSSunil Mushran atomic_dec(&dlm->res_cur_count); 5596800791aSSunil Mushran 560b0d4f817SSunil Mushran dlm_put(dlm); 56129576f8bSSunil Mushran 562a7f90d83SKurt Hackel if (!hlist_unhashed(&res->hash_node) || 563a7f90d83SKurt Hackel !list_empty(&res->granted) || 564a7f90d83SKurt Hackel 
!list_empty(&res->converting) || 565a7f90d83SKurt Hackel !list_empty(&res->blocked) || 566a7f90d83SKurt Hackel !list_empty(&res->dirty) || 567a7f90d83SKurt Hackel !list_empty(&res->recovering) || 568a7f90d83SKurt Hackel !list_empty(&res->purge)) { 569a7f90d83SKurt Hackel mlog(ML_ERROR, 570a7f90d83SKurt Hackel "Going to BUG for resource %.*s." 571a7f90d83SKurt Hackel " We're on a list! [%c%c%c%c%c%c%c]\n", 572a7f90d83SKurt Hackel res->lockname.len, res->lockname.name, 573a7f90d83SKurt Hackel !hlist_unhashed(&res->hash_node) ? 'H' : ' ', 574a7f90d83SKurt Hackel !list_empty(&res->granted) ? 'G' : ' ', 575a7f90d83SKurt Hackel !list_empty(&res->converting) ? 'C' : ' ', 576a7f90d83SKurt Hackel !list_empty(&res->blocked) ? 'B' : ' ', 577a7f90d83SKurt Hackel !list_empty(&res->dirty) ? 'D' : ' ', 578a7f90d83SKurt Hackel !list_empty(&res->recovering) ? 'R' : ' ', 579a7f90d83SKurt Hackel !list_empty(&res->purge) ? 'P' : ' '); 580a7f90d83SKurt Hackel 581a7f90d83SKurt Hackel dlm_print_one_lock_resource(res); 582a7f90d83SKurt Hackel } 583a7f90d83SKurt Hackel 5846714d8e8SKurt Hackel /* By the time we're ready to blow this guy away, we shouldn't 5856714d8e8SKurt Hackel * be on any lists. 
*/
	/* tail of dlm_lockres_release(): by the time the final kref is
	 * dropped the resource must already be unhashed and off every
	 * per-domain list, otherwise something still references it. */
	BUG_ON(!hlist_unhashed(&res->hash_node));
	BUG_ON(!list_empty(&res->granted));
	BUG_ON(!list_empty(&res->converting));
	BUG_ON(!list_empty(&res->blocked));
	BUG_ON(!list_empty(&res->dirty));
	BUG_ON(!list_empty(&res->recovering));
	BUG_ON(!list_empty(&res->purge));

	/* name buffer and resource were allocated from dedicated slab
	 * caches in dlm_new_lockres(); return them to the same caches */
	kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);

	kmem_cache_free(dlm_lockres_cache, res);
}

/* Drop a reference on a lock resource.  When the last reference goes
 * away, dlm_lockres_release() frees the resource and its name. */
void dlm_lockres_put(struct dlm_lock_resource *res)
{
	kref_put(&res->refs, dlm_lockres_release);
}

/*
 * Initialize a freshly allocated lock resource: copy in the lock name,
 * reset all lists/counters, take a domain reference, and add the
 * resource to the domain's tracking list.  Takes dlm->spinlock briefly
 * for the tracking-list insert; caller need not hold any locks.
 */
static void dlm_init_lockres(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res,
			     const char *name, unsigned int namelen)
{
	char *qname;

	/* If we memset here, we lose our reference to the kmalloc'd
	 * res->lockname.name, so be sure to init every field
	 * correctly! */

	/* lockname.name is const in the struct; cast away to fill it in */
	qname = (char *) res->lockname.name;
	memcpy(qname, name, namelen);

	res->lockname.len = namelen;
	res->lockname.hash = dlm_lockid_hash(name, namelen);

	init_waitqueue_head(&res->wq);
	spin_lock_init(&res->spinlock);
	INIT_HLIST_NODE(&res->hash_node);
	INIT_LIST_HEAD(&res->granted);
	INIT_LIST_HEAD(&res->converting);
	INIT_LIST_HEAD(&res->blocked);
	INIT_LIST_HEAD(&res->dirty);
	INIT_LIST_HEAD(&res->recovering);
	INIT_LIST_HEAD(&res->purge);
	INIT_LIST_HEAD(&res->tracking);
	atomic_set(&res->asts_reserved, 0);
	res->migration_pending = 0;
	res->inflight_locks = 0;

	/* put in dlm_lockres_release */
	dlm_grab(dlm);
	res->dlm = dlm;

	kref_init(&res->refs);

	/* per-domain lockres accounting counters */
	atomic_inc(&dlm->res_tot_count);
	atomic_inc(&dlm->res_cur_count);

	/* just for consistency */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
	spin_unlock(&res->spinlock);

	/* new lockres starts IN_PROGRESS; cleared once mastery resolves */
	res->state = DLM_LOCK_RES_IN_PROGRESS;

	res->last_used = 0;

	spin_lock(&dlm->spinlock);
	list_add_tail(&res->tracking, &dlm->tracking_list);
	spin_unlock(&dlm->spinlock);

	memset(res->lvb, 0, DLM_LVB_LEN);
	memset(res->refmap, 0, sizeof(res->refmap));
}

/*
 * Allocate a new lock resource plus its name buffer from the slab
 * caches and initialize it via dlm_init_lockres().
 * Returns NULL on allocation failure; returns the resource with one
 * reference held (from kref_init) on success.
 */
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
					  const char *name,
					  unsigned int namelen)
{
	struct dlm_lock_resource *res = NULL;

	res = (struct dlm_lock_resource *)
		kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
	if (!res)
		goto error;

	res->lockname.name = (char *)
		kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
	if (!res->lockname.name)
		goto error;

	dlm_init_lockres(dlm, res, name, namelen);
	return res;

error:
	/* free whatever was successfully allocated, in reverse order */
	if (res && res->lockname.name)
		kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);

	if (res)
		kmem_cache_free(dlm_lockres_cache, res);
	return NULL;
}

/*
 * Take an "inflight" reference on a lockres: sets this node's bit in
 * the refmap on the first inflight lock and bumps the counter.  Caller
 * holds res->spinlock unless the lockres is brand new (new_lockres) and
 * therefore not yet visible to anyone else.
 * NOTE(review): file/line are unused in this body — presumably supplied
 * by a wrapper macro for debugging; confirm against dlmcommon.h.
 */
void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     int new_lockres,
				     const char *file,
				     int line)
{
	if (!new_lockres)
		assert_spin_locked(&res->spinlock);

	/* first inflight lock on this node: refmap bit must not yet be
	 * set and the counter must be zero */
	if (!test_bit(dlm->node_num, res->refmap)) {
		BUG_ON(res->inflight_locks != 0);
		dlm_lockres_set_refmap_bit(dlm->node_num, res);
	}
	res->inflight_locks++;
	mlog(0, "%s:%.*s: inflight++: now %u\n",
	     dlm->name, res->lockname.len, res->lockname.name,
	     res->inflight_locks);
}

/*
 * Drop an inflight reference taken by __dlm_lockres_grab_inflight_ref.
 * Clears this node's refmap bit when the count hits zero and wakes any
 * waiters (e.g. the purge path).  Caller holds res->spinlock.
 * NOTE(review): file/line unused here as well — see grab variant.
 */
void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     const char *file,
				     int line)
{
	assert_spin_locked(&res->spinlock);

	BUG_ON(res->inflight_locks == 0);
	res->inflight_locks--;
	mlog(0, "%s:%.*s: inflight--: now %u\n",
	     dlm->name, res->lockname.len, res->lockname.name,
	     res->inflight_locks);
	if (res->inflight_locks == 0)
		dlm_lockres_clear_refmap_bit(dlm->node_num, res);
	wake_up(&res->wq);
}

/*
 * lookup a lock resource by name.
 * may already exist in the hashtable.
 * lockid is null terminated
 *
 * if not, allocate enough for the lockres and for
 * the temporary structure used in doing the mastering.
 *
 * also, do a lookup in the dlm->master_list to see
 * if another node has begun mastering the same lock.
 * if so, there should be a block entry in there
 * for this name, and we should *not* attempt to master
 * the lock here. need to wait around for that node
 * to assert_master (or die).
 *
 */
struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
						 const char *lockid,
						 int namelen,
						 int flags)
{
	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_master_list_entry *alloc_mle = NULL;
	int blocked = 0;
	int ret, nodenum;
	struct dlm_node_iter iter;
	unsigned int hash;
	int tries = 0;
	int bit, wait_on_recovery = 0;
	int drop_inflight_if_nonlocal = 0;

	BUG_ON(!lockid);

	hash = dlm_lockid_hash(lockid, namelen);

	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);

lookup:
	spin_lock(&dlm->spinlock);
	tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
	if (tmpres) {
		int dropping_ref = 0;

		/* found an existing lockres; the lookup took a ref on it,
		 * so the dlm spinlock can be dropped now */
		spin_unlock(&dlm->spinlock);

		spin_lock(&tmpres->spinlock);
		/* We wait for the other thread that is mastering the resource */
		if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
			__dlm_wait_on_lockres(tmpres);
			BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
		}

		if (tmpres->owner == dlm->node_num) {
			BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
			dlm_lockres_grab_inflight_ref(dlm, tmpres);
		} else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
			dropping_ref = 1;
		spin_unlock(&tmpres->spinlock);

		/* wait until done messaging the master, drop our ref to allow
		 * the lockres to be purged, start over. */
		if (dropping_ref) {
			spin_lock(&tmpres->spinlock);
			__dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
			spin_unlock(&tmpres->spinlock);
			dlm_lockres_put(tmpres);
			tmpres = NULL;
			goto lookup;
		}

		mlog(0, "found in hash!\n");
		/* drop the lockres we allocated on a previous pass, if any */
		if (res)
			dlm_lockres_put(res);
		res = tmpres;
		goto leave;
	}

	if (!res) {
		spin_unlock(&dlm->spinlock);
		mlog(0, "allocating a new resource\n");
		/* nothing found and we need to allocate one. */
		alloc_mle = (struct dlm_master_list_entry *)
			kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
		if (!alloc_mle)
			goto leave;
		res = dlm_new_lockres(dlm, lockid, namelen);
		if (!res)
			goto leave;
		/* re-run the lookup with allocations in hand: someone may
		 * have raced us and inserted the lockres meanwhile */
		goto lookup;
	}

	mlog(0, "no lockres found, allocated our own: %p\n", res);

	if (flags & LKM_LOCAL) {
		/* caller knows it's safe to assume it's not mastered elsewhere
		 * DONE! return right away */
		spin_lock(&res->spinlock);
		dlm_change_lockres_owner(dlm, res, dlm->node_num);
		__dlm_insert_lockres(dlm, res);
		dlm_lockres_grab_inflight_ref(dlm, res);
		spin_unlock(&res->spinlock);
		spin_unlock(&dlm->spinlock);
		/* lockres still marked IN_PROGRESS */
		goto wake_waiters;
	}

	/* check master list to see if another node has started mastering it */
	spin_lock(&dlm->master_lock);

	/* if we found a block, wait for lock to be mastered by another node */
	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
	if (blocked) {
		int mig;
		if (mle->type == DLM_MLE_MASTER) {
			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
			BUG();
		}
		mig = (mle->type == DLM_MLE_MIGRATION);
		/* if there is a migration in progress, let the migration
		 * finish before continuing.  we can wait for the absence
		 * of the MIGRATION mle: either the migrate finished or
		 * one of the nodes died and the mle was cleaned up.
		 * if there is a BLOCK here, but it already has a master
		 * set, we are too late.  the master does not have a ref
		 * for us in the refmap.  detach the mle and drop it.
		 * either way, go back to the top and start over. */
		if (mig || mle->master != O2NM_MAX_NODES) {
			BUG_ON(mig && mle->master == dlm->node_num);
			/* we arrived too late.  the master does not
			 * have a ref for us. retry. */
			mlog(0, "%s:%.*s: late on %s\n",
			     dlm->name, namelen, lockid,
			     mig ? "MIGRATION" : "BLOCK");
			spin_unlock(&dlm->master_lock);
			spin_unlock(&dlm->spinlock);

			/* master is known, detach */
			if (!mig)
				dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
			mle = NULL;
			/* this is lame, but we cant wait on either
			 * the mle or lockres waitqueue here */
			if (mig)
				msleep(100);
			goto lookup;
		}
	} else {
		/* go ahead and try to master lock on this node */
		mle = alloc_mle;
		/* make sure this does not get freed below */
		alloc_mle = NULL;
		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
		set_bit(dlm->node_num, mle->maybe_map);
		__dlm_insert_mle(dlm, mle);

		/* still holding the dlm spinlock, check the recovery map
		 * to see if there are any nodes that still need to be
		 * considered.  these will not appear in the mle nodemap
		 * but they might own this lockres.  wait on them. */
		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES) {
			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
			     "recover before lock mastery can begin\n",
			     dlm->name, namelen, (char *)lockid, bit);
			wait_on_recovery = 1;
		}
	}

	/* at this point there is either a DLM_MLE_BLOCK or a
	 * DLM_MLE_MASTER on the master list, so it's safe to add the
	 * lockres to the hashtable.  anyone who finds the lock will
	 * still have to wait on the IN_PROGRESS. */

	/* finally add the lockres to its hash bucket */
	__dlm_insert_lockres(dlm, res);
	/* since this lockres is new it does not require the spinlock */
	dlm_lockres_grab_inflight_ref_new(dlm, res);

	/* if this node does not become the master make sure to drop
	 * this inflight reference below */
	drop_inflight_if_nonlocal = 1;

	/* get an extra ref on the mle in case this is a BLOCK
	 * if so, the creator of the BLOCK may try to put the last
	 * ref at this time in the assert master handler, so we
	 * need an extra one to keep from a bad ptr deref. */
	dlm_get_mle_inuse(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

redo_request:
	while (wait_on_recovery) {
		/* any cluster changes that occurred after dropping the
		 * dlm spinlock would be detectable by a change on the mle,
		 * so we only need to clear out the recovery map once. */
		if (dlm_is_recovery_lock(lockid, namelen)) {
			/* the $RECOVERY lock itself must be mastered even
			 * while recovery is pending */
			mlog(ML_NOTICE, "%s: recovery map is not empty, but "
			     "must master $RECOVERY lock now\n", dlm->name);
			if (!dlm_pre_master_reco_lockres(dlm, res))
				wait_on_recovery = 0;
			else {
				mlog(0, "%s: waiting 500ms for heartbeat state "
				    "change\n", dlm->name);
				msleep(500);
			}
			continue;
		}

		dlm_kick_recovery_thread(dlm);
		msleep(1000);
		dlm_wait_for_recovery(dlm);

		/* re-check the recovery map under the dlm spinlock */
		spin_lock(&dlm->spinlock);
		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES) {
			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
			     "recover before lock mastery can begin\n",
			     dlm->name, namelen, (char *)lockid, bit);
			wait_on_recovery = 1;
		} else
			wait_on_recovery = 0;
		spin_unlock(&dlm->spinlock);

		if (wait_on_recovery)
			dlm_wait_for_node_recovery(dlm, bit, 10000);
	}

	/* must wait for lock to be mastered elsewhere */
	if (blocked)
		goto wait;

	/* send a master request to every node in the vote map */
	ret = -EINVAL;
	dlm_node_iter_init(mle->vote_map, &iter);
	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		ret = dlm_do_master_request(res, mle, nodenum);
		if (ret < 0)
			mlog_errno(ret);
		if (mle->master != O2NM_MAX_NODES) {
			/* found a master ! */
			if (mle->master <= nodenum)
				break;
			/* if our master request has not reached the master
			 * yet, keep going until it does.  this is how the
			 * master will know that asserts are needed back to
			 * the lower nodes. */
			mlog(0, "%s:%.*s: requests only up to %u but master "
			     "is %u, keep going\n", dlm->name, namelen,
			     lockid, nodenum, mle->master);
		}
	}

wait:
	/* keep going until the response map includes all nodes */
	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
	if (ret < 0) {
		wait_on_recovery = 1;
		mlog(0, "%s:%.*s: node map changed, redo the "
		     "master request now, blocked=%d\n",
		     dlm->name, res->lockname.len,
		     res->lockname.name, blocked);
		if (++tries > 20) {
			mlog(ML_ERROR, "%s:%.*s: spinning on "
			     "dlm_wait_for_lock_mastery, blocked=%d\n",
			     dlm->name, res->lockname.len,
			     res->lockname.name, blocked);
			dlm_print_one_lock_resource(res);
			dlm_print_one_mle(mle);
			tries = 0;
		}
		goto redo_request;
	}

	mlog(0, "lockres mastered by %u\n", res->owner);
	/* make sure we never
continue without this */
	BUG_ON(res->owner == O2NM_MAX_NODES);

	/* master is known, detach if not already detached */
	dlm_mle_detach_hb_events(dlm, mle);
	dlm_put_mle(mle);
	/* put the extra ref */
	dlm_put_mle_inuse(mle);

wake_waiters:
	spin_lock(&res->spinlock);
	/* this node lost mastery; give back the inflight ref taken above */
	if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
		dlm_lockres_drop_inflight_ref(dlm, res);
	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

leave:
	/* need to free the unused mle */
	if (alloc_mle)
		kmem_cache_free(dlm_mle_cache, alloc_mle);

	return res;
}


/* how long to wait for an assert_master before re-checking state */
#define DLM_MASTERY_TIMEOUT_MS   5000

/*
 * Wait for mastery of a lock resource to be resolved.  Loops (via the
 * recheck label) until either another node asserts mastery or this node
 * determines it has the lowest node number among the candidates and
 * asserts mastery itself.  Returns 0 on success; a negative value from
 * dlm_restart_lock_mastery() means the node map changed and the caller
 * must redo the master requests.  *blocked is updated if the mle type
 * flips between BLOCK and MASTER during a restart.
 */
static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     int *blocked)
{
	u8 m;
	int ret, bit;
	int map_changed, voting_done;
	int assert, sleep;

recheck:
	ret = 0;
	assert = 0;

	/* check if another node has already become the owner */
	spin_lock(&res->spinlock);
	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
		mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
		     res->lockname.len, res->lockname.name, res->owner);
		spin_unlock(&res->spinlock);
		/* this will cause the master to re-assert across
		 * the whole cluster, freeing up mles */
		if (res->owner != dlm->node_num) {
			ret = dlm_do_master_request(res, mle, res->owner);
			if (ret < 0) {
				/* give recovery a chance to run */
				mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
				msleep(500);
				goto recheck;
			}
		}
		ret = 0;
		goto leave;
	}
	spin_unlock(&res->spinlock);

	spin_lock(&mle->spinlock);
	m = mle->master;
	/* vote_map vs node_map: did cluster membership change?
	 * vote_map vs response_map: have all polled nodes answered? */
	map_changed = (memcmp(mle->vote_map, mle->node_map,
			      sizeof(mle->vote_map)) != 0);
	voting_done = (memcmp(mle->vote_map, mle->response_map,
			      sizeof(mle->vote_map)) == 0);

	/* restart if we hit any errors */
	if (map_changed) {
		int b;
		mlog(0, "%s: %.*s: node map changed, restarting\n",
		     dlm->name, res->lockname.len, res->lockname.name);
		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
		b = (mle->type == DLM_MLE_BLOCK);
		/* the restart may have converted a BLOCK mle to MASTER
		 * (or vice versa); propagate that to the caller */
		if ((*blocked && !b) || (!*blocked && b)) {
			mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
			     dlm->name, res->lockname.len, res->lockname.name,
			     *blocked, b);
			*blocked = b;
		}
		spin_unlock(&mle->spinlock);
		if (ret < 0) {
			mlog_errno(ret);
			goto leave;
		}
		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
		     "rechecking now\n", dlm->name, res->lockname.len,
		     res->lockname.name);
		goto recheck;
	} else {
		if (!voting_done) {
			mlog(0, "map not changed and voting not done "
			     "for %s:%.*s\n", dlm->name, res->lockname.len,
			     res->lockname.name);
		}
	}

	if (m != O2NM_MAX_NODES) {
		/* another node has done an assert!
		 * all done! */
		sleep = 0;
	} else {
		sleep = 1;
		/* have all nodes responded? */
		if (voting_done && !*blocked) {
			/* lowest-numbered candidate node wins mastery */
			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
			if (dlm->node_num <= bit) {
				/* my node number is lowest.
				 * now tell other nodes that I am
				 * mastering this. */
				mle->master = dlm->node_num;
				/* ref was grabbed in get_lock_resource
				 * will be dropped in dlmlock_master */
				assert = 1;
				sleep = 0;
			}
			/* if voting is done, but we have not received
			 * an assert master yet, we must sleep */
		}
	}

	spin_unlock(&mle->spinlock);

	/* sleep if we haven't finished voting yet */
	if (sleep) {
		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);

		/*
		if (atomic_read(&mle->mle_refs.refcount) < 2)
			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
			atomic_read(&mle->mle_refs.refcount),
			res->lockname.len, res->lockname.name);
		*/
		atomic_set(&mle->woken, 0);
		(void)wait_event_timeout(mle->wq,
					 (atomic_read(&mle->woken) == 1),
					 timeo);
		if (res->owner == O2NM_MAX_NODES) {
			mlog(0, "%s:%.*s: waiting again\n", dlm->name,
			     res->lockname.len, res->lockname.name);
			goto recheck;
		}
		mlog(0, "done waiting, master is %u\n", res->owner);
		ret = 0;
		goto leave;
	}

	ret = 0;   /* done */
	if (assert) {
		m = dlm->node_num;
		mlog(0, "about to master %.*s here, this=%u\n",
		     res->lockname.len, res->lockname.name, m);
		ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
		if (ret) {
			/* This is a failure in the network path,
			 * not in the response to the assert_master
			 * (any nonzero response is a BUG on this node).
			 * Most likely a socket just got disconnected
			 * due to node death. */
			mlog_errno(ret);
		}
		/* no longer need to restart lock mastery.
		 * all living nodes have been contacted. */
		ret = 0;
	}

	/* set the lockres owner */
	spin_lock(&res->spinlock);
	/* mastery reference obtained either during
	 * assert_master_handler or in get_lock_resource */
	dlm_change_lockres_owner(dlm, res, m);
	spin_unlock(&res->spinlock);

leave:
	return ret;
}

/* Iterator over the symmetric difference of two node bitmaps: yields
 * each node whose up/down state differs between orig_bm and cur_bm. */
struct dlm_bitmap_diff_iter
{
	int curnode;		/* last node returned; -1 before first call */
	unsigned long *orig_bm;	/* snapshot of the old node map */
	unsigned long *cur_bm;	/* current node map */
	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* orig XOR cur */
};

enum dlm_node_state_change
{
	NODE_DOWN = -1,
	NODE_NO_CHANGE = 0,
	NODE_UP
};

/* Precompute the per-word symmetric difference of the two bitmaps. */
static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
				      unsigned long *orig_bm,
				      unsigned long *cur_bm)
{
	unsigned long p1, p2;
	int i;

	iter->curnode = -1;
	iter->orig_bm = orig_bm;
	iter->cur_bm = cur_bm;

	for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
		p1 = *(iter->orig_bm + i);
		p2 = *(iter->cur_bm + i);
		/* (p1 & ~p2) | (p2 & ~p1) == p1 XOR p2 */
		iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
	}
}

/*
 * Return the next changed node number and set *state to NODE_DOWN if
 * the node was present in the original map (i.e. it died) or NODE_UP if
 * it is newly present.  Returns -ENOENT when no changed nodes remain.
 */
static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
				     enum dlm_node_state_change *state)
{
	int bit;

	if (iter->curnode >= O2NM_MAX_NODES)
		return -ENOENT;

	bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
			    iter->curnode+1);
	if (bit >= O2NM_MAX_NODES) {
		iter->curnode = O2NM_MAX_NODES;
		return -ENOENT;
	}

	/* if it was there in the original then this node died */
	if (test_bit(bit, iter->orig_bm))
		*state = NODE_DOWN;
	else
		*state = NODE_UP;

	iter->curnode = bit;
	return bit;
}


/*
 * Handle cluster-membership changes that occurred while mastery voting
 * was in progress.  Called with mle->spinlock held.  For each node that
 * came up, clear its old vote and re-poll it; for each node that died,
 * reset the voting state entirely.  Returns -EAGAIN if anything changed
 * (caller must redo the master requests), 0 otherwise.
 */
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res,
				    struct dlm_master_list_entry *mle,
				    int blocked)
{
	struct dlm_bitmap_diff_iter bdi;
	enum dlm_node_state_change sc;
	int node;
	int ret = 0;

	mlog(0, "something happened such that the "
	     "master process may need to be restarted!\n");

	assert_spin_locked(&mle->spinlock);

	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	while (node >= 0) {
		if (sc == NODE_UP) {
			/* a node came up.  clear any old vote from
			 * the response map and set it in the vote map
			 * then restart the mastery. */
			mlog(ML_NOTICE, "node %d up while restarting\n", node);

			/* redo the master request, but only for the new node */
			mlog(0, "sending request to new node\n");
			clear_bit(node, mle->response_map);
			set_bit(node, mle->vote_map);
		} else {
			mlog(ML_ERROR, "node down! %d\n", node);
			if (blocked) {
				int lowest = find_next_bit(mle->maybe_map,
						       O2NM_MAX_NODES, 0);

				/* act like it was never there */
				clear_bit(node, mle->maybe_map);

				if (node == lowest) {
					mlog(0, "expected master %u died"
					    " while this node was blocked "
					    "waiting on it!\n", node);
					lowest = find_next_bit(mle->maybe_map,
							O2NM_MAX_NODES,
							lowest+1);
					if (lowest < O2NM_MAX_NODES) {
						mlog(0, "%s:%.*s:still "
						     "blocked. waiting on %u "
						     "now\n", dlm->name,
						     res->lockname.len,
						     res->lockname.name,
						     lowest);
					} else {
						/* mle is an MLE_BLOCK, but
						 * there is now nothing left to
						 * block on.  we need to return
						 * all the way back out and try
						 * again with an MLE_MASTER.
						 * dlm_do_local_recovery_cleanup
						 * has already run, so the mle
						 * refcount is ok */
						mlog(0, "%s:%.*s: no "
						     "longer blocking. try to "
						     "master this here\n",
						     dlm->name,
						     res->lockname.len,
						     res->lockname.name);
						mle->type = DLM_MLE_MASTER;
						mle->u.mleres = res;
					}
				}
			}

			/* now blank out everything, as if we had never
			 * contacted anyone */
			memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
			memset(mle->response_map, 0, sizeof(mle->response_map));
			/* reset the vote_map to the current node_map */
			memcpy(mle->vote_map, mle->node_map,
			       sizeof(mle->node_map));
			/* put myself into the maybe map */
			if (mle->type != DLM_MLE_BLOCK)
				set_bit(dlm->node_num, mle->maybe_map);
		}
		ret = -EAGAIN;
		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	}
	return ret;
}


/*
 * DLM_MASTER_REQUEST_MSG
 *
 * returns: 0 on success,
 *          -errno on a network error
 *
 * on error, the caller should assume the target node is "dead"
 *
 */

static int dlm_do_master_request(struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle, int to)
{
	struct dlm_ctxt *dlm = mle->dlm;
	struct
dlm_master_request request; 13446714d8e8SKurt Hackel int ret, response=0, resend; 1345f77a9a78SSunil Mushran unsigned char *mlename; 1346f77a9a78SSunil Mushran unsigned int mlenamelen; 13476714d8e8SKurt Hackel 13486714d8e8SKurt Hackel memset(&request, 0, sizeof(request)); 13496714d8e8SKurt Hackel request.node_idx = dlm->node_num; 13506714d8e8SKurt Hackel 13516714d8e8SKurt Hackel BUG_ON(mle->type == DLM_MLE_MIGRATION); 13526714d8e8SKurt Hackel 13532ed6c750SSunil Mushran __dlm_mle_name(mle, &mlename, &mlenamelen, NULL); 1354f77a9a78SSunil Mushran 1355f77a9a78SSunil Mushran request.namelen = (u8)mlenamelen; 1356f77a9a78SSunil Mushran memcpy(request.name, mlename, request.namelen); 13576714d8e8SKurt Hackel 13586714d8e8SKurt Hackel again: 13596714d8e8SKurt Hackel ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, 13606714d8e8SKurt Hackel sizeof(request), to, &response); 13616714d8e8SKurt Hackel if (ret < 0) { 13626714d8e8SKurt Hackel if (ret == -ESRCH) { 13636714d8e8SKurt Hackel /* should never happen */ 13646714d8e8SKurt Hackel mlog(ML_ERROR, "TCP stack not ready!\n"); 13656714d8e8SKurt Hackel BUG(); 13666714d8e8SKurt Hackel } else if (ret == -EINVAL) { 13676714d8e8SKurt Hackel mlog(ML_ERROR, "bad args passed to o2net!\n"); 13686714d8e8SKurt Hackel BUG(); 13696714d8e8SKurt Hackel } else if (ret == -ENOMEM) { 13706714d8e8SKurt Hackel mlog(ML_ERROR, "out of memory while trying to send " 13716714d8e8SKurt Hackel "network message! retrying\n"); 13726714d8e8SKurt Hackel /* this is totally crude */ 13736714d8e8SKurt Hackel msleep(50); 13746714d8e8SKurt Hackel goto again; 13756714d8e8SKurt Hackel } else if (!dlm_is_host_down(ret)) { 13766714d8e8SKurt Hackel /* not a network error. bad. 
*/ 13776714d8e8SKurt Hackel mlog_errno(ret); 13786714d8e8SKurt Hackel mlog(ML_ERROR, "unhandled error!"); 13796714d8e8SKurt Hackel BUG(); 13806714d8e8SKurt Hackel } 13816714d8e8SKurt Hackel /* all other errors should be network errors, 13826714d8e8SKurt Hackel * and likely indicate node death */ 13836714d8e8SKurt Hackel mlog(ML_ERROR, "link to %d went down!\n", to); 13846714d8e8SKurt Hackel goto out; 13856714d8e8SKurt Hackel } 13866714d8e8SKurt Hackel 13876714d8e8SKurt Hackel ret = 0; 13886714d8e8SKurt Hackel resend = 0; 13896714d8e8SKurt Hackel spin_lock(&mle->spinlock); 13906714d8e8SKurt Hackel switch (response) { 13916714d8e8SKurt Hackel case DLM_MASTER_RESP_YES: 13926714d8e8SKurt Hackel set_bit(to, mle->response_map); 13936714d8e8SKurt Hackel mlog(0, "node %u is the master, response=YES\n", to); 1394ba2bf218SKurt Hackel mlog(0, "%s:%.*s: master node %u now knows I have a " 1395ba2bf218SKurt Hackel "reference\n", dlm->name, res->lockname.len, 1396ba2bf218SKurt Hackel res->lockname.name, to); 13976714d8e8SKurt Hackel mle->master = to; 13986714d8e8SKurt Hackel break; 13996714d8e8SKurt Hackel case DLM_MASTER_RESP_NO: 14006714d8e8SKurt Hackel mlog(0, "node %u not master, response=NO\n", to); 14016714d8e8SKurt Hackel set_bit(to, mle->response_map); 14026714d8e8SKurt Hackel break; 14036714d8e8SKurt Hackel case DLM_MASTER_RESP_MAYBE: 14046714d8e8SKurt Hackel mlog(0, "node %u not master, response=MAYBE\n", to); 14056714d8e8SKurt Hackel set_bit(to, mle->response_map); 14066714d8e8SKurt Hackel set_bit(to, mle->maybe_map); 14076714d8e8SKurt Hackel break; 14086714d8e8SKurt Hackel case DLM_MASTER_RESP_ERROR: 14096714d8e8SKurt Hackel mlog(0, "node %u hit an error, resending\n", to); 14106714d8e8SKurt Hackel resend = 1; 14116714d8e8SKurt Hackel response = 0; 14126714d8e8SKurt Hackel break; 14136714d8e8SKurt Hackel default: 14146714d8e8SKurt Hackel mlog(ML_ERROR, "bad response! 
%u\n", response); 14156714d8e8SKurt Hackel BUG(); 14166714d8e8SKurt Hackel } 14176714d8e8SKurt Hackel spin_unlock(&mle->spinlock); 14186714d8e8SKurt Hackel if (resend) { 14196714d8e8SKurt Hackel /* this is also totally crude */ 14206714d8e8SKurt Hackel msleep(50); 14216714d8e8SKurt Hackel goto again; 14226714d8e8SKurt Hackel } 14236714d8e8SKurt Hackel 14246714d8e8SKurt Hackel out: 14256714d8e8SKurt Hackel return ret; 14266714d8e8SKurt Hackel } 14276714d8e8SKurt Hackel 14286714d8e8SKurt Hackel /* 14296714d8e8SKurt Hackel * locks that can be taken here: 14306714d8e8SKurt Hackel * dlm->spinlock 14316714d8e8SKurt Hackel * res->spinlock 14326714d8e8SKurt Hackel * mle->spinlock 14336714d8e8SKurt Hackel * dlm->master_list 14346714d8e8SKurt Hackel * 14356714d8e8SKurt Hackel * if possible, TRIM THIS DOWN!!! 14366714d8e8SKurt Hackel */ 1437d74c9803SKurt Hackel int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, 1438d74c9803SKurt Hackel void **ret_data) 14396714d8e8SKurt Hackel { 14406714d8e8SKurt Hackel u8 response = DLM_MASTER_RESP_MAYBE; 14416714d8e8SKurt Hackel struct dlm_ctxt *dlm = data; 14429c6510a5SKurt Hackel struct dlm_lock_resource *res = NULL; 14436714d8e8SKurt Hackel struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; 14446714d8e8SKurt Hackel struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; 14456714d8e8SKurt Hackel char *name; 1446a3d33291SMark Fasheh unsigned int namelen, hash; 14476714d8e8SKurt Hackel int found, ret; 14486714d8e8SKurt Hackel int set_maybe; 14499c6510a5SKurt Hackel int dispatch_assert = 0; 14506714d8e8SKurt Hackel 14516714d8e8SKurt Hackel if (!dlm_grab(dlm)) 14526714d8e8SKurt Hackel return DLM_MASTER_RESP_NO; 14536714d8e8SKurt Hackel 14546714d8e8SKurt Hackel if (!dlm_domain_fully_joined(dlm)) { 14556714d8e8SKurt Hackel response = DLM_MASTER_RESP_NO; 14566714d8e8SKurt Hackel goto send_response; 14576714d8e8SKurt Hackel } 14586714d8e8SKurt Hackel 14596714d8e8SKurt Hackel name = 
request->name; 14606714d8e8SKurt Hackel namelen = request->namelen; 1461a3d33291SMark Fasheh hash = dlm_lockid_hash(name, namelen); 14626714d8e8SKurt Hackel 14636714d8e8SKurt Hackel if (namelen > DLM_LOCKID_NAME_MAX) { 14646714d8e8SKurt Hackel response = DLM_IVBUFLEN; 14656714d8e8SKurt Hackel goto send_response; 14666714d8e8SKurt Hackel } 14676714d8e8SKurt Hackel 14686714d8e8SKurt Hackel way_up_top: 14696714d8e8SKurt Hackel spin_lock(&dlm->spinlock); 1470a3d33291SMark Fasheh res = __dlm_lookup_lockres(dlm, name, namelen, hash); 14716714d8e8SKurt Hackel if (res) { 14726714d8e8SKurt Hackel spin_unlock(&dlm->spinlock); 14736714d8e8SKurt Hackel 14746714d8e8SKurt Hackel /* take care of the easy cases up front */ 14756714d8e8SKurt Hackel spin_lock(&res->spinlock); 14761cd04dbeSKurt Hackel if (res->state & (DLM_LOCK_RES_RECOVERING| 14771cd04dbeSKurt Hackel DLM_LOCK_RES_MIGRATING)) { 14786714d8e8SKurt Hackel spin_unlock(&res->spinlock); 14796714d8e8SKurt Hackel mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " 14801cd04dbeSKurt Hackel "being recovered/migrated\n"); 14816714d8e8SKurt Hackel response = DLM_MASTER_RESP_ERROR; 14826714d8e8SKurt Hackel if (mle) 14836714d8e8SKurt Hackel kmem_cache_free(dlm_mle_cache, mle); 14846714d8e8SKurt Hackel goto send_response; 14856714d8e8SKurt Hackel } 14866714d8e8SKurt Hackel 14876714d8e8SKurt Hackel if (res->owner == dlm->node_num) { 1488ba2bf218SKurt Hackel mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1489ba2bf218SKurt Hackel dlm->name, namelen, name, request->node_idx); 1490ba2bf218SKurt Hackel dlm_lockres_set_refmap_bit(request->node_idx, res); 14916714d8e8SKurt Hackel spin_unlock(&res->spinlock); 14926714d8e8SKurt Hackel response = DLM_MASTER_RESP_YES; 14936714d8e8SKurt Hackel if (mle) 14946714d8e8SKurt Hackel kmem_cache_free(dlm_mle_cache, mle); 14956714d8e8SKurt Hackel 14966714d8e8SKurt Hackel /* this node is the owner. 
14976714d8e8SKurt Hackel * there is some extra work that needs to 14986714d8e8SKurt Hackel * happen now. the requesting node has 14996714d8e8SKurt Hackel * caused all nodes up to this one to 15006714d8e8SKurt Hackel * create mles. this node now needs to 15016714d8e8SKurt Hackel * go back and clean those up. */ 15029c6510a5SKurt Hackel dispatch_assert = 1; 15036714d8e8SKurt Hackel goto send_response; 15046714d8e8SKurt Hackel } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 15056714d8e8SKurt Hackel spin_unlock(&res->spinlock); 15066714d8e8SKurt Hackel // mlog(0, "node %u is the master\n", res->owner); 15076714d8e8SKurt Hackel response = DLM_MASTER_RESP_NO; 15086714d8e8SKurt Hackel if (mle) 15096714d8e8SKurt Hackel kmem_cache_free(dlm_mle_cache, mle); 15106714d8e8SKurt Hackel goto send_response; 15116714d8e8SKurt Hackel } 15126714d8e8SKurt Hackel 15136714d8e8SKurt Hackel /* ok, there is no owner. either this node is 15146714d8e8SKurt Hackel * being blocked, or it is actively trying to 15156714d8e8SKurt Hackel * master this lock. 
*/ 15166714d8e8SKurt Hackel if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { 15176714d8e8SKurt Hackel mlog(ML_ERROR, "lock with no owner should be " 15186714d8e8SKurt Hackel "in-progress!\n"); 15196714d8e8SKurt Hackel BUG(); 15206714d8e8SKurt Hackel } 15216714d8e8SKurt Hackel 15226714d8e8SKurt Hackel // mlog(0, "lockres is in progress...\n"); 15236714d8e8SKurt Hackel spin_lock(&dlm->master_lock); 15246714d8e8SKurt Hackel found = dlm_find_mle(dlm, &tmpmle, name, namelen); 15256714d8e8SKurt Hackel if (!found) { 15266714d8e8SKurt Hackel mlog(ML_ERROR, "no mle found for this lock!\n"); 15276714d8e8SKurt Hackel BUG(); 15286714d8e8SKurt Hackel } 15296714d8e8SKurt Hackel set_maybe = 1; 15306714d8e8SKurt Hackel spin_lock(&tmpmle->spinlock); 15316714d8e8SKurt Hackel if (tmpmle->type == DLM_MLE_BLOCK) { 15326714d8e8SKurt Hackel // mlog(0, "this node is waiting for " 15336714d8e8SKurt Hackel // "lockres to be mastered\n"); 15346714d8e8SKurt Hackel response = DLM_MASTER_RESP_NO; 15356714d8e8SKurt Hackel } else if (tmpmle->type == DLM_MLE_MIGRATION) { 15366714d8e8SKurt Hackel mlog(0, "node %u is master, but trying to migrate to " 15376714d8e8SKurt Hackel "node %u.\n", tmpmle->master, tmpmle->new_master); 15386714d8e8SKurt Hackel if (tmpmle->master == dlm->node_num) { 15396714d8e8SKurt Hackel mlog(ML_ERROR, "no owner on lockres, but this " 15406714d8e8SKurt Hackel "node is trying to migrate it to %u?!\n", 15416714d8e8SKurt Hackel tmpmle->new_master); 15426714d8e8SKurt Hackel BUG(); 15436714d8e8SKurt Hackel } else { 15446714d8e8SKurt Hackel /* the real master can respond on its own */ 15456714d8e8SKurt Hackel response = DLM_MASTER_RESP_NO; 15466714d8e8SKurt Hackel } 15476714d8e8SKurt Hackel } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { 15486714d8e8SKurt Hackel set_maybe = 0; 15499c6510a5SKurt Hackel if (tmpmle->master == dlm->node_num) { 15506714d8e8SKurt Hackel response = DLM_MASTER_RESP_YES; 15519c6510a5SKurt Hackel /* this node will be the owner. 
15529c6510a5SKurt Hackel * go back and clean the mles on any 15539c6510a5SKurt Hackel * other nodes */ 15549c6510a5SKurt Hackel dispatch_assert = 1; 1555ba2bf218SKurt Hackel dlm_lockres_set_refmap_bit(request->node_idx, res); 1556ba2bf218SKurt Hackel mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1557ba2bf218SKurt Hackel dlm->name, namelen, name, 1558ba2bf218SKurt Hackel request->node_idx); 15599c6510a5SKurt Hackel } else 15606714d8e8SKurt Hackel response = DLM_MASTER_RESP_NO; 15616714d8e8SKurt Hackel } else { 15626714d8e8SKurt Hackel // mlog(0, "this node is attempting to " 15636714d8e8SKurt Hackel // "master lockres\n"); 15646714d8e8SKurt Hackel response = DLM_MASTER_RESP_MAYBE; 15656714d8e8SKurt Hackel } 15666714d8e8SKurt Hackel if (set_maybe) 15676714d8e8SKurt Hackel set_bit(request->node_idx, tmpmle->maybe_map); 15686714d8e8SKurt Hackel spin_unlock(&tmpmle->spinlock); 15696714d8e8SKurt Hackel 15706714d8e8SKurt Hackel spin_unlock(&dlm->master_lock); 15716714d8e8SKurt Hackel spin_unlock(&res->spinlock); 15726714d8e8SKurt Hackel 15736714d8e8SKurt Hackel /* keep the mle attached to heartbeat events */ 15746714d8e8SKurt Hackel dlm_put_mle(tmpmle); 15756714d8e8SKurt Hackel if (mle) 15766714d8e8SKurt Hackel kmem_cache_free(dlm_mle_cache, mle); 15776714d8e8SKurt Hackel goto send_response; 15786714d8e8SKurt Hackel } 15796714d8e8SKurt Hackel 15806714d8e8SKurt Hackel /* 15816714d8e8SKurt Hackel * lockres doesn't exist on this node 15826714d8e8SKurt Hackel * if there is an MLE_BLOCK, return NO 15836714d8e8SKurt Hackel * if there is an MLE_MASTER, return MAYBE 15846714d8e8SKurt Hackel * otherwise, add an MLE_BLOCK, return NO 15856714d8e8SKurt Hackel */ 15866714d8e8SKurt Hackel spin_lock(&dlm->master_lock); 15876714d8e8SKurt Hackel found = dlm_find_mle(dlm, &tmpmle, name, namelen); 15886714d8e8SKurt Hackel if (!found) { 15896714d8e8SKurt Hackel /* this lockid has never been seen on this node yet */ 15906714d8e8SKurt Hackel // mlog(0, "no mle found\n"); 15916714d8e8SKurt 
Hackel if (!mle) { 15926714d8e8SKurt Hackel spin_unlock(&dlm->master_lock); 15936714d8e8SKurt Hackel spin_unlock(&dlm->spinlock); 15946714d8e8SKurt Hackel 15956714d8e8SKurt Hackel mle = (struct dlm_master_list_entry *) 1596ad8100e0SKurt Hackel kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 15976714d8e8SKurt Hackel if (!mle) { 15986714d8e8SKurt Hackel response = DLM_MASTER_RESP_ERROR; 15999c6510a5SKurt Hackel mlog_errno(-ENOMEM); 16006714d8e8SKurt Hackel goto send_response; 16016714d8e8SKurt Hackel } 16026714d8e8SKurt Hackel goto way_up_top; 16036714d8e8SKurt Hackel } 16046714d8e8SKurt Hackel 16056714d8e8SKurt Hackel // mlog(0, "this is second time thru, already allocated, " 16066714d8e8SKurt Hackel // "add the block.\n"); 160741b8c8a1SKurt Hackel dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); 16086714d8e8SKurt Hackel set_bit(request->node_idx, mle->maybe_map); 16091c084577SSunil Mushran __dlm_insert_mle(dlm, mle); 16106714d8e8SKurt Hackel response = DLM_MASTER_RESP_NO; 16116714d8e8SKurt Hackel } else { 16126714d8e8SKurt Hackel // mlog(0, "mle was found\n"); 16136714d8e8SKurt Hackel set_maybe = 1; 16146714d8e8SKurt Hackel spin_lock(&tmpmle->spinlock); 16159c6510a5SKurt Hackel if (tmpmle->master == dlm->node_num) { 16169c6510a5SKurt Hackel mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); 16179c6510a5SKurt Hackel BUG(); 16189c6510a5SKurt Hackel } 16196714d8e8SKurt Hackel if (tmpmle->type == DLM_MLE_BLOCK) 16206714d8e8SKurt Hackel response = DLM_MASTER_RESP_NO; 16216714d8e8SKurt Hackel else if (tmpmle->type == DLM_MLE_MIGRATION) { 16226714d8e8SKurt Hackel mlog(0, "migration mle was found (%u->%u)\n", 16236714d8e8SKurt Hackel tmpmle->master, tmpmle->new_master); 16246714d8e8SKurt Hackel /* real master can respond on its own */ 16256714d8e8SKurt Hackel response = DLM_MASTER_RESP_NO; 16266714d8e8SKurt Hackel } else 16276714d8e8SKurt Hackel response = DLM_MASTER_RESP_MAYBE; 16286714d8e8SKurt Hackel if (set_maybe) 16296714d8e8SKurt Hackel 
set_bit(request->node_idx, tmpmle->maybe_map); 16306714d8e8SKurt Hackel spin_unlock(&tmpmle->spinlock); 16316714d8e8SKurt Hackel } 16326714d8e8SKurt Hackel spin_unlock(&dlm->master_lock); 16336714d8e8SKurt Hackel spin_unlock(&dlm->spinlock); 16346714d8e8SKurt Hackel 16356714d8e8SKurt Hackel if (found) { 16366714d8e8SKurt Hackel /* keep the mle attached to heartbeat events */ 16376714d8e8SKurt Hackel dlm_put_mle(tmpmle); 16386714d8e8SKurt Hackel } 16396714d8e8SKurt Hackel send_response: 1640b31cfc02SSunil Mushran /* 1641b31cfc02SSunil Mushran * __dlm_lookup_lockres() grabbed a reference to this lockres. 1642b31cfc02SSunil Mushran * The reference is released by dlm_assert_master_worker() under 1643b31cfc02SSunil Mushran * the call to dlm_dispatch_assert_master(). If 1644b31cfc02SSunil Mushran * dlm_assert_master_worker() isn't called, we drop it here. 1645b31cfc02SSunil Mushran */ 16469c6510a5SKurt Hackel if (dispatch_assert) { 16479c6510a5SKurt Hackel if (response != DLM_MASTER_RESP_YES) 16489c6510a5SKurt Hackel mlog(ML_ERROR, "invalid response %d\n", response); 16499c6510a5SKurt Hackel if (!res) { 16509c6510a5SKurt Hackel mlog(ML_ERROR, "bad lockres while trying to assert!\n"); 16519c6510a5SKurt Hackel BUG(); 16529c6510a5SKurt Hackel } 16539c6510a5SKurt Hackel mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 16549c6510a5SKurt Hackel dlm->node_num, res->lockname.len, res->lockname.name); 16559c6510a5SKurt Hackel ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 16569c6510a5SKurt Hackel DLM_ASSERT_MASTER_MLE_CLEANUP); 16579c6510a5SKurt Hackel if (ret < 0) { 16589c6510a5SKurt Hackel mlog(ML_ERROR, "failed to dispatch assert master work\n"); 16599c6510a5SKurt Hackel response = DLM_MASTER_RESP_ERROR; 1660b31cfc02SSunil Mushran dlm_lockres_put(res); 16619c6510a5SKurt Hackel } 1662b31cfc02SSunil Mushran } else { 1663b31cfc02SSunil Mushran if (res) 1664b31cfc02SSunil Mushran dlm_lockres_put(res); 16659c6510a5SKurt Hackel } 16669c6510a5SKurt 
Hackel 16676714d8e8SKurt Hackel dlm_put(dlm); 16686714d8e8SKurt Hackel return response; 16696714d8e8SKurt Hackel } 16706714d8e8SKurt Hackel 16716714d8e8SKurt Hackel /* 16726714d8e8SKurt Hackel * DLM_ASSERT_MASTER_MSG 16736714d8e8SKurt Hackel */ 16746714d8e8SKurt Hackel 16756714d8e8SKurt Hackel 16766714d8e8SKurt Hackel /* 16776714d8e8SKurt Hackel * NOTE: this can be used for debugging 16786714d8e8SKurt Hackel * can periodically run all locks owned by this node 16796714d8e8SKurt Hackel * and re-assert across the cluster... 16806714d8e8SKurt Hackel */ 168105488bbeSAdrian Bunk static int dlm_do_assert_master(struct dlm_ctxt *dlm, 1682ba2bf218SKurt Hackel struct dlm_lock_resource *res, 1683ba2bf218SKurt Hackel void *nodemap, u32 flags) 16846714d8e8SKurt Hackel { 16856714d8e8SKurt Hackel struct dlm_assert_master assert; 16866714d8e8SKurt Hackel int to, tmpret; 16876714d8e8SKurt Hackel struct dlm_node_iter iter; 16886714d8e8SKurt Hackel int ret = 0; 16899c6510a5SKurt Hackel int reassert; 1690ba2bf218SKurt Hackel const char *lockname = res->lockname.name; 1691ba2bf218SKurt Hackel unsigned int namelen = res->lockname.len; 16926714d8e8SKurt Hackel 16936714d8e8SKurt Hackel BUG_ON(namelen > O2NM_MAX_NAME_LEN); 1694f3f85464SSunil Mushran 1695f3f85464SSunil Mushran spin_lock(&res->spinlock); 1696f3f85464SSunil Mushran res->state |= DLM_LOCK_RES_SETREF_INPROG; 1697f3f85464SSunil Mushran spin_unlock(&res->spinlock); 1698f3f85464SSunil Mushran 16999c6510a5SKurt Hackel again: 17009c6510a5SKurt Hackel reassert = 0; 17016714d8e8SKurt Hackel 17026714d8e8SKurt Hackel /* note that if this nodemap is empty, it returns 0 */ 17036714d8e8SKurt Hackel dlm_node_iter_init(nodemap, &iter); 17046714d8e8SKurt Hackel while ((to = dlm_node_iter_next(&iter)) >= 0) { 17056714d8e8SKurt Hackel int r = 0; 1706a9ee4c8aSKurt Hackel struct dlm_master_list_entry *mle = NULL; 1707a9ee4c8aSKurt Hackel 17086714d8e8SKurt Hackel mlog(0, "sending assert master to %d (%.*s)\n", to, 17096714d8e8SKurt Hackel namelen, 
lockname); 17106714d8e8SKurt Hackel memset(&assert, 0, sizeof(assert)); 17116714d8e8SKurt Hackel assert.node_idx = dlm->node_num; 17126714d8e8SKurt Hackel assert.namelen = namelen; 17136714d8e8SKurt Hackel memcpy(assert.name, lockname, namelen); 17146714d8e8SKurt Hackel assert.flags = cpu_to_be32(flags); 17156714d8e8SKurt Hackel 17166714d8e8SKurt Hackel tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 17176714d8e8SKurt Hackel &assert, sizeof(assert), to, &r); 17186714d8e8SKurt Hackel if (tmpret < 0) { 17193b3b84a8SKurt Hackel mlog(0, "assert_master returned %d!\n", tmpret); 17206714d8e8SKurt Hackel if (!dlm_is_host_down(tmpret)) { 17213b3b84a8SKurt Hackel mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 17226714d8e8SKurt Hackel BUG(); 17236714d8e8SKurt Hackel } 17246714d8e8SKurt Hackel /* a node died. finish out the rest of the nodes. */ 17253b3b84a8SKurt Hackel mlog(0, "link to %d went down!\n", to); 17266714d8e8SKurt Hackel /* any nonzero status return will do */ 17276714d8e8SKurt Hackel ret = tmpret; 1728ba2bf218SKurt Hackel r = 0; 17296714d8e8SKurt Hackel } else if (r < 0) { 17306714d8e8SKurt Hackel /* ok, something horribly messed. kill thyself. 
*/ 17316714d8e8SKurt Hackel mlog(ML_ERROR,"during assert master of %.*s to %u, " 17326714d8e8SKurt Hackel "got %d.\n", namelen, lockname, to, r); 1733a9ee4c8aSKurt Hackel spin_lock(&dlm->spinlock); 1734a9ee4c8aSKurt Hackel spin_lock(&dlm->master_lock); 1735a9ee4c8aSKurt Hackel if (dlm_find_mle(dlm, &mle, (char *)lockname, 1736a9ee4c8aSKurt Hackel namelen)) { 1737a9ee4c8aSKurt Hackel dlm_print_one_mle(mle); 1738a9ee4c8aSKurt Hackel __dlm_put_mle(mle); 1739a9ee4c8aSKurt Hackel } 1740a9ee4c8aSKurt Hackel spin_unlock(&dlm->master_lock); 1741a9ee4c8aSKurt Hackel spin_unlock(&dlm->spinlock); 17426714d8e8SKurt Hackel BUG(); 1743ba2bf218SKurt Hackel } 1744ba2bf218SKurt Hackel 1745ba2bf218SKurt Hackel if (r & DLM_ASSERT_RESPONSE_REASSERT && 1746ba2bf218SKurt Hackel !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { 1747ba2bf218SKurt Hackel mlog(ML_ERROR, "%.*s: very strange, " 1748ba2bf218SKurt Hackel "master MLE but no lockres on %u\n", 1749ba2bf218SKurt Hackel namelen, lockname, to); 1750ba2bf218SKurt Hackel } 1751ba2bf218SKurt Hackel 1752ba2bf218SKurt Hackel if (r & DLM_ASSERT_RESPONSE_REASSERT) { 17539c6510a5SKurt Hackel mlog(0, "%.*s: node %u create mles on other " 17549c6510a5SKurt Hackel "nodes and requests a re-assert\n", 17559c6510a5SKurt Hackel namelen, lockname, to); 17569c6510a5SKurt Hackel reassert = 1; 17576714d8e8SKurt Hackel } 1758ba2bf218SKurt Hackel if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { 1759ba2bf218SKurt Hackel mlog(0, "%.*s: node %u has a reference to this " 1760ba2bf218SKurt Hackel "lockres, set the bit in the refmap\n", 1761ba2bf218SKurt Hackel namelen, lockname, to); 1762ba2bf218SKurt Hackel spin_lock(&res->spinlock); 1763ba2bf218SKurt Hackel dlm_lockres_set_refmap_bit(to, res); 1764ba2bf218SKurt Hackel spin_unlock(&res->spinlock); 1765ba2bf218SKurt Hackel } 17666714d8e8SKurt Hackel } 17676714d8e8SKurt Hackel 17689c6510a5SKurt Hackel if (reassert) 17699c6510a5SKurt Hackel goto again; 17709c6510a5SKurt Hackel 1771f3f85464SSunil Mushran 
spin_lock(&res->spinlock); 1772f3f85464SSunil Mushran res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 1773f3f85464SSunil Mushran spin_unlock(&res->spinlock); 1774f3f85464SSunil Mushran wake_up(&res->wq); 1775f3f85464SSunil Mushran 17766714d8e8SKurt Hackel return ret; 17776714d8e8SKurt Hackel } 17786714d8e8SKurt Hackel 17796714d8e8SKurt Hackel /* 17806714d8e8SKurt Hackel * locks that can be taken here: 17816714d8e8SKurt Hackel * dlm->spinlock 17826714d8e8SKurt Hackel * res->spinlock 17836714d8e8SKurt Hackel * mle->spinlock 17846714d8e8SKurt Hackel * dlm->master_list 17856714d8e8SKurt Hackel * 17866714d8e8SKurt Hackel * if possible, TRIM THIS DOWN!!! 17876714d8e8SKurt Hackel */ 1788d74c9803SKurt Hackel int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, 1789d74c9803SKurt Hackel void **ret_data) 17906714d8e8SKurt Hackel { 17916714d8e8SKurt Hackel struct dlm_ctxt *dlm = data; 17926714d8e8SKurt Hackel struct dlm_master_list_entry *mle = NULL; 17936714d8e8SKurt Hackel struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; 17946714d8e8SKurt Hackel struct dlm_lock_resource *res = NULL; 17956714d8e8SKurt Hackel char *name; 1796a3d33291SMark Fasheh unsigned int namelen, hash; 17976714d8e8SKurt Hackel u32 flags; 1798ba2bf218SKurt Hackel int master_request = 0, have_lockres_ref = 0; 17999c6510a5SKurt Hackel int ret = 0; 18006714d8e8SKurt Hackel 18016714d8e8SKurt Hackel if (!dlm_grab(dlm)) 18026714d8e8SKurt Hackel return 0; 18036714d8e8SKurt Hackel 18046714d8e8SKurt Hackel name = assert->name; 18056714d8e8SKurt Hackel namelen = assert->namelen; 1806a3d33291SMark Fasheh hash = dlm_lockid_hash(name, namelen); 18076714d8e8SKurt Hackel flags = be32_to_cpu(assert->flags); 18086714d8e8SKurt Hackel 18096714d8e8SKurt Hackel if (namelen > DLM_LOCKID_NAME_MAX) { 18106714d8e8SKurt Hackel mlog(ML_ERROR, "Invalid name length!"); 18116714d8e8SKurt Hackel goto done; 18126714d8e8SKurt Hackel } 18136714d8e8SKurt Hackel 18146714d8e8SKurt Hackel 
spin_lock(&dlm->spinlock); 18156714d8e8SKurt Hackel 18166714d8e8SKurt Hackel if (flags) 18176714d8e8SKurt Hackel mlog(0, "assert_master with flags: %u\n", flags); 18186714d8e8SKurt Hackel 18196714d8e8SKurt Hackel /* find the MLE */ 18206714d8e8SKurt Hackel spin_lock(&dlm->master_lock); 18216714d8e8SKurt Hackel if (!dlm_find_mle(dlm, &mle, name, namelen)) { 18226714d8e8SKurt Hackel /* not an error, could be master just re-asserting */ 18236714d8e8SKurt Hackel mlog(0, "just got an assert_master from %u, but no " 18246714d8e8SKurt Hackel "MLE for it! (%.*s)\n", assert->node_idx, 18256714d8e8SKurt Hackel namelen, name); 18266714d8e8SKurt Hackel } else { 18276714d8e8SKurt Hackel int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); 18286714d8e8SKurt Hackel if (bit >= O2NM_MAX_NODES) { 18296714d8e8SKurt Hackel /* not necessarily an error, though less likely. 18306714d8e8SKurt Hackel * could be master just re-asserting. */ 1831aa852354SKurt Hackel mlog(0, "no bits set in the maybe_map, but %u " 18326714d8e8SKurt Hackel "is asserting! (%.*s)\n", assert->node_idx, 18336714d8e8SKurt Hackel namelen, name); 18346714d8e8SKurt Hackel } else if (bit != assert->node_idx) { 18356714d8e8SKurt Hackel if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { 18366714d8e8SKurt Hackel mlog(0, "master %u was found, %u should " 18376714d8e8SKurt Hackel "back off\n", assert->node_idx, bit); 18386714d8e8SKurt Hackel } else { 18396714d8e8SKurt Hackel /* with the fix for bug 569, a higher node 18406714d8e8SKurt Hackel * number winning the mastery will respond 18416714d8e8SKurt Hackel * YES to mastery requests, but this node 18426714d8e8SKurt Hackel * had no way of knowing. let it pass. */ 1843aa852354SKurt Hackel mlog(0, "%u is the lowest node, " 18446714d8e8SKurt Hackel "%u is asserting. 
(%.*s) %u must " 18456714d8e8SKurt Hackel "have begun after %u won.\n", bit, 18466714d8e8SKurt Hackel assert->node_idx, namelen, name, bit, 18476714d8e8SKurt Hackel assert->node_idx); 18486714d8e8SKurt Hackel } 18496714d8e8SKurt Hackel } 18502d1a868cSKurt Hackel if (mle->type == DLM_MLE_MIGRATION) { 18512d1a868cSKurt Hackel if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { 18522d1a868cSKurt Hackel mlog(0, "%s:%.*s: got cleanup assert" 18532d1a868cSKurt Hackel " from %u for migration\n", 18542d1a868cSKurt Hackel dlm->name, namelen, name, 18552d1a868cSKurt Hackel assert->node_idx); 18562d1a868cSKurt Hackel } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { 18572d1a868cSKurt Hackel mlog(0, "%s:%.*s: got unrelated assert" 18582d1a868cSKurt Hackel " from %u for migration, ignoring\n", 18592d1a868cSKurt Hackel dlm->name, namelen, name, 18602d1a868cSKurt Hackel assert->node_idx); 18612d1a868cSKurt Hackel __dlm_put_mle(mle); 18622d1a868cSKurt Hackel spin_unlock(&dlm->master_lock); 18632d1a868cSKurt Hackel spin_unlock(&dlm->spinlock); 18642d1a868cSKurt Hackel goto done; 18652d1a868cSKurt Hackel } 18662d1a868cSKurt Hackel } 18676714d8e8SKurt Hackel } 18686714d8e8SKurt Hackel spin_unlock(&dlm->master_lock); 18696714d8e8SKurt Hackel 18706714d8e8SKurt Hackel /* ok everything checks out with the MLE 18716714d8e8SKurt Hackel * now check to see if there is a lockres */ 1872a3d33291SMark Fasheh res = __dlm_lookup_lockres(dlm, name, namelen, hash); 18736714d8e8SKurt Hackel if (res) { 18746714d8e8SKurt Hackel spin_lock(&res->spinlock); 18756714d8e8SKurt Hackel if (res->state & DLM_LOCK_RES_RECOVERING) { 18766714d8e8SKurt Hackel mlog(ML_ERROR, "%u asserting but %.*s is " 18776714d8e8SKurt Hackel "RECOVERING!\n", assert->node_idx, namelen, name); 18786714d8e8SKurt Hackel goto kill; 18796714d8e8SKurt Hackel } 18806714d8e8SKurt Hackel if (!mle) { 1881dc2ed195SKurt Hackel if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && 1882dc2ed195SKurt Hackel res->owner != assert->node_idx) { 
188353ecd25eSSunil Mushran mlog(ML_ERROR, "DIE! Mastery assert from %u, " 188453ecd25eSSunil Mushran "but current owner is %u! (%.*s)\n", 188553ecd25eSSunil Mushran assert->node_idx, res->owner, namelen, 188653ecd25eSSunil Mushran name); 188753ecd25eSSunil Mushran __dlm_print_one_lock_resource(res); 188853ecd25eSSunil Mushran BUG(); 18896714d8e8SKurt Hackel } 18906714d8e8SKurt Hackel } else if (mle->type != DLM_MLE_MIGRATION) { 18916714d8e8SKurt Hackel if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 18926714d8e8SKurt Hackel /* owner is just re-asserting */ 18936714d8e8SKurt Hackel if (res->owner == assert->node_idx) { 18946714d8e8SKurt Hackel mlog(0, "owner %u re-asserting on " 18956714d8e8SKurt Hackel "lock %.*s\n", assert->node_idx, 18966714d8e8SKurt Hackel namelen, name); 18976714d8e8SKurt Hackel goto ok; 18986714d8e8SKurt Hackel } 18996714d8e8SKurt Hackel mlog(ML_ERROR, "got assert_master from " 19006714d8e8SKurt Hackel "node %u, but %u is the owner! " 19016714d8e8SKurt Hackel "(%.*s)\n", assert->node_idx, 19026714d8e8SKurt Hackel res->owner, namelen, name); 19036714d8e8SKurt Hackel goto kill; 19046714d8e8SKurt Hackel } 19056714d8e8SKurt Hackel if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { 19066714d8e8SKurt Hackel mlog(ML_ERROR, "got assert from %u, but lock " 19076714d8e8SKurt Hackel "with no owner should be " 19086714d8e8SKurt Hackel "in-progress! 
(%.*s)\n", 19096714d8e8SKurt Hackel assert->node_idx, 19106714d8e8SKurt Hackel namelen, name); 19116714d8e8SKurt Hackel goto kill; 19126714d8e8SKurt Hackel } 19136714d8e8SKurt Hackel } else /* mle->type == DLM_MLE_MIGRATION */ { 19146714d8e8SKurt Hackel /* should only be getting an assert from new master */ 19156714d8e8SKurt Hackel if (assert->node_idx != mle->new_master) { 19166714d8e8SKurt Hackel mlog(ML_ERROR, "got assert from %u, but " 19176714d8e8SKurt Hackel "new master is %u, and old master " 19186714d8e8SKurt Hackel "was %u (%.*s)\n", 19196714d8e8SKurt Hackel assert->node_idx, mle->new_master, 19206714d8e8SKurt Hackel mle->master, namelen, name); 19216714d8e8SKurt Hackel goto kill; 19226714d8e8SKurt Hackel } 19236714d8e8SKurt Hackel 19246714d8e8SKurt Hackel } 19256714d8e8SKurt Hackel ok: 19266714d8e8SKurt Hackel spin_unlock(&res->spinlock); 19276714d8e8SKurt Hackel } 19286714d8e8SKurt Hackel spin_unlock(&dlm->spinlock); 19296714d8e8SKurt Hackel 19306714d8e8SKurt Hackel // mlog(0, "woo! 
got an assert_master from node %u!\n", 19316714d8e8SKurt Hackel // assert->node_idx); 19326714d8e8SKurt Hackel if (mle) { 19339c6510a5SKurt Hackel int extra_ref = 0; 19349c6510a5SKurt Hackel int nn = -1; 1935a2bf0477SKurt Hackel int rr, err = 0; 19366714d8e8SKurt Hackel 19376714d8e8SKurt Hackel spin_lock(&mle->spinlock); 19389c6510a5SKurt Hackel if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 19399c6510a5SKurt Hackel extra_ref = 1; 19409c6510a5SKurt Hackel else { 19419c6510a5SKurt Hackel /* MASTER mle: if any bits set in the response map 19429c6510a5SKurt Hackel * then the calling node needs to re-assert to clear 19439c6510a5SKurt Hackel * up nodes that this node contacted */ 19449c6510a5SKurt Hackel while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 19459c6510a5SKurt Hackel nn+1)) < O2NM_MAX_NODES) { 19469c6510a5SKurt Hackel if (nn != dlm->node_num && nn != assert->node_idx) 19479c6510a5SKurt Hackel master_request = 1; 19489c6510a5SKurt Hackel } 19499c6510a5SKurt Hackel } 19506714d8e8SKurt Hackel mle->master = assert->node_idx; 19516714d8e8SKurt Hackel atomic_set(&mle->woken, 1); 19526714d8e8SKurt Hackel wake_up(&mle->wq); 19536714d8e8SKurt Hackel spin_unlock(&mle->spinlock); 19546714d8e8SKurt Hackel 1955a2bf0477SKurt Hackel if (res) { 1956a6fa3640SKurt Hackel int wake = 0; 1957a2bf0477SKurt Hackel spin_lock(&res->spinlock); 1958a2bf0477SKurt Hackel if (mle->type == DLM_MLE_MIGRATION) { 19596714d8e8SKurt Hackel mlog(0, "finishing off migration of lockres %.*s, " 19606714d8e8SKurt Hackel "from %u to %u\n", 19616714d8e8SKurt Hackel res->lockname.len, res->lockname.name, 19626714d8e8SKurt Hackel dlm->node_num, mle->new_master); 19636714d8e8SKurt Hackel res->state &= ~DLM_LOCK_RES_MIGRATING; 1964a6fa3640SKurt Hackel wake = 1; 19656714d8e8SKurt Hackel dlm_change_lockres_owner(dlm, res, mle->new_master); 19666714d8e8SKurt Hackel BUG_ON(res->state & DLM_LOCK_RES_DIRTY); 1967a2bf0477SKurt Hackel } else { 1968a2bf0477SKurt Hackel 
dlm_change_lockres_owner(dlm, res, mle->master); 1969a2bf0477SKurt Hackel } 19706714d8e8SKurt Hackel spin_unlock(&res->spinlock); 1971ba2bf218SKurt Hackel have_lockres_ref = 1; 1972a6fa3640SKurt Hackel if (wake) 1973a6fa3640SKurt Hackel wake_up(&res->wq); 19746714d8e8SKurt Hackel } 19756714d8e8SKurt Hackel 1976a2bf0477SKurt Hackel /* master is known, detach if not already detached. 1977a2bf0477SKurt Hackel * ensures that only one assert_master call will happen 1978a2bf0477SKurt Hackel * on this mle. */ 1979a2bf0477SKurt Hackel spin_lock(&dlm->spinlock); 1980a2bf0477SKurt Hackel spin_lock(&dlm->master_lock); 1981a2bf0477SKurt Hackel 1982a2bf0477SKurt Hackel rr = atomic_read(&mle->mle_refs.refcount); 1983a2bf0477SKurt Hackel if (mle->inuse > 0) { 1984a2bf0477SKurt Hackel if (extra_ref && rr < 3) 1985a2bf0477SKurt Hackel err = 1; 1986a2bf0477SKurt Hackel else if (!extra_ref && rr < 2) 1987a2bf0477SKurt Hackel err = 1; 1988a2bf0477SKurt Hackel } else { 1989a2bf0477SKurt Hackel if (extra_ref && rr < 2) 1990a2bf0477SKurt Hackel err = 1; 1991a2bf0477SKurt Hackel else if (!extra_ref && rr < 1) 1992a2bf0477SKurt Hackel err = 1; 1993a2bf0477SKurt Hackel } 1994a2bf0477SKurt Hackel if (err) { 1995a2bf0477SKurt Hackel mlog(ML_ERROR, "%s:%.*s: got assert master from %u " 1996a2bf0477SKurt Hackel "that will mess up this node, refs=%d, extra=%d, " 1997a2bf0477SKurt Hackel "inuse=%d\n", dlm->name, namelen, name, 1998a2bf0477SKurt Hackel assert->node_idx, rr, extra_ref, mle->inuse); 1999a2bf0477SKurt Hackel dlm_print_one_mle(mle); 2000a2bf0477SKurt Hackel } 20011c084577SSunil Mushran __dlm_unlink_mle(dlm, mle); 2002a2bf0477SKurt Hackel __dlm_mle_detach_hb_events(dlm, mle); 2003a2bf0477SKurt Hackel __dlm_put_mle(mle); 20046714d8e8SKurt Hackel if (extra_ref) { 20056714d8e8SKurt Hackel /* the assert master message now balances the extra 20066714d8e8SKurt Hackel * ref given by the master / migration request message. 
20076714d8e8SKurt Hackel * if this is the last put, it will be removed 20086714d8e8SKurt Hackel * from the list. */ 2009a2bf0477SKurt Hackel __dlm_put_mle(mle); 2010a2bf0477SKurt Hackel } 2011a2bf0477SKurt Hackel spin_unlock(&dlm->master_lock); 2012a2bf0477SKurt Hackel spin_unlock(&dlm->spinlock); 2013a2bf0477SKurt Hackel } else if (res) { 2014a2bf0477SKurt Hackel if (res->owner != assert->node_idx) { 2015a2bf0477SKurt Hackel mlog(0, "assert_master from %u, but current " 2016a2bf0477SKurt Hackel "owner is %u (%.*s), no mle\n", assert->node_idx, 2017a2bf0477SKurt Hackel res->owner, namelen, name); 20186714d8e8SKurt Hackel } 20196714d8e8SKurt Hackel } 20206714d8e8SKurt Hackel 20216714d8e8SKurt Hackel done: 20229c6510a5SKurt Hackel ret = 0; 20233b8118cfSKurt Hackel if (res) { 20243b8118cfSKurt Hackel spin_lock(&res->spinlock); 20253b8118cfSKurt Hackel res->state |= DLM_LOCK_RES_SETREF_INPROG; 20263b8118cfSKurt Hackel spin_unlock(&res->spinlock); 20273b8118cfSKurt Hackel *ret_data = (void *)res; 20283b8118cfSKurt Hackel } 20296714d8e8SKurt Hackel dlm_put(dlm); 20309c6510a5SKurt Hackel if (master_request) { 20319c6510a5SKurt Hackel mlog(0, "need to tell master to reassert\n"); 2032ba2bf218SKurt Hackel /* positive. negative would shoot down the node. 
*/ 2033ba2bf218SKurt Hackel ret |= DLM_ASSERT_RESPONSE_REASSERT; 2034ba2bf218SKurt Hackel if (!have_lockres_ref) { 2035ba2bf218SKurt Hackel mlog(ML_ERROR, "strange, got assert from %u, MASTER " 2036ba2bf218SKurt Hackel "mle present here for %s:%.*s, but no lockres!\n", 2037ba2bf218SKurt Hackel assert->node_idx, dlm->name, namelen, name); 2038ba2bf218SKurt Hackel } 2039ba2bf218SKurt Hackel } 2040ba2bf218SKurt Hackel if (have_lockres_ref) { 2041ba2bf218SKurt Hackel /* let the master know we have a reference to the lockres */ 2042ba2bf218SKurt Hackel ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; 2043ba2bf218SKurt Hackel mlog(0, "%s:%.*s: got assert from %u, need a ref\n", 2044ba2bf218SKurt Hackel dlm->name, namelen, name, assert->node_idx); 20459c6510a5SKurt Hackel } 20469c6510a5SKurt Hackel return ret; 20476714d8e8SKurt Hackel 20486714d8e8SKurt Hackel kill: 20496714d8e8SKurt Hackel /* kill the caller! */ 2050a9ee4c8aSKurt Hackel mlog(ML_ERROR, "Bad message received from another node. Dumping state " 2051a9ee4c8aSKurt Hackel "and killing the other node now! 
This node is OK and can continue.\n"); 2052a9ee4c8aSKurt Hackel __dlm_print_one_lock_resource(res); 20536714d8e8SKurt Hackel spin_unlock(&res->spinlock); 20546714d8e8SKurt Hackel spin_unlock(&dlm->spinlock); 20553b8118cfSKurt Hackel *ret_data = (void *)res; 20566714d8e8SKurt Hackel dlm_put(dlm); 20576714d8e8SKurt Hackel return -EINVAL; 20586714d8e8SKurt Hackel } 20596714d8e8SKurt Hackel 20603b8118cfSKurt Hackel void dlm_assert_master_post_handler(int status, void *data, void *ret_data) 20613b8118cfSKurt Hackel { 20623b8118cfSKurt Hackel struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; 20633b8118cfSKurt Hackel 20643b8118cfSKurt Hackel if (ret_data) { 20653b8118cfSKurt Hackel spin_lock(&res->spinlock); 20663b8118cfSKurt Hackel res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 20673b8118cfSKurt Hackel spin_unlock(&res->spinlock); 20683b8118cfSKurt Hackel wake_up(&res->wq); 20693b8118cfSKurt Hackel dlm_lockres_put(res); 20703b8118cfSKurt Hackel } 20713b8118cfSKurt Hackel return; 20723b8118cfSKurt Hackel } 20733b8118cfSKurt Hackel 20746714d8e8SKurt Hackel int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, 20756714d8e8SKurt Hackel struct dlm_lock_resource *res, 20766714d8e8SKurt Hackel int ignore_higher, u8 request_from, u32 flags) 20776714d8e8SKurt Hackel { 20786714d8e8SKurt Hackel struct dlm_work_item *item; 2079cd861280SRobert P. J. 
Day item = kzalloc(sizeof(*item), GFP_NOFS); 20806714d8e8SKurt Hackel if (!item) 20816714d8e8SKurt Hackel return -ENOMEM; 20826714d8e8SKurt Hackel 20836714d8e8SKurt Hackel 20846714d8e8SKurt Hackel /* queue up work for dlm_assert_master_worker */ 20856714d8e8SKurt Hackel dlm_grab(dlm); /* get an extra ref for the work item */ 20866714d8e8SKurt Hackel dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); 20876714d8e8SKurt Hackel item->u.am.lockres = res; /* already have a ref */ 20886714d8e8SKurt Hackel /* can optionally ignore node numbers higher than this node */ 20896714d8e8SKurt Hackel item->u.am.ignore_higher = ignore_higher; 20906714d8e8SKurt Hackel item->u.am.request_from = request_from; 20916714d8e8SKurt Hackel item->u.am.flags = flags; 20926714d8e8SKurt Hackel 20939c6510a5SKurt Hackel if (ignore_higher) 20949c6510a5SKurt Hackel mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 20959c6510a5SKurt Hackel res->lockname.name); 20969c6510a5SKurt Hackel 20976714d8e8SKurt Hackel spin_lock(&dlm->work_lock); 20986714d8e8SKurt Hackel list_add_tail(&item->list, &dlm->work_list); 20996714d8e8SKurt Hackel spin_unlock(&dlm->work_lock); 21006714d8e8SKurt Hackel 21013156d267SKurt Hackel queue_work(dlm->dlm_worker, &dlm->dispatched_work); 21026714d8e8SKurt Hackel return 0; 21036714d8e8SKurt Hackel } 21046714d8e8SKurt Hackel 21056714d8e8SKurt Hackel static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) 21066714d8e8SKurt Hackel { 21076714d8e8SKurt Hackel struct dlm_ctxt *dlm = data; 21086714d8e8SKurt Hackel int ret = 0; 21096714d8e8SKurt Hackel struct dlm_lock_resource *res; 21106714d8e8SKurt Hackel unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 21116714d8e8SKurt Hackel int ignore_higher; 21126714d8e8SKurt Hackel int bit; 21136714d8e8SKurt Hackel u8 request_from; 21146714d8e8SKurt Hackel u32 flags; 21156714d8e8SKurt Hackel 21166714d8e8SKurt Hackel dlm = item->dlm; 21176714d8e8SKurt Hackel res = item->u.am.lockres; 21186714d8e8SKurt 
/* Worker-thread side of dlm_dispatch_assert_master: broadcast an
 * assert_master for item->u.am.lockres to (a subset of) the domain.
 * Drops the lockres reference taken by the dispatcher before returning. */
static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
{
	struct dlm_ctxt *dlm = data;
	int ret = 0;
	struct dlm_lock_resource *res;
	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int ignore_higher;
	int bit;
	u8 request_from;
	u32 flags;

	dlm = item->dlm;
	res = item->u.am.lockres;
	ignore_higher = item->u.am.ignore_higher;
	request_from = item->u.am.request_from;
	flags = item->u.am.flags;

	/* snapshot the domain map under the dlm spinlock; the broadcast
	 * below runs against this copy even if membership changes */
	spin_lock(&dlm->spinlock);
	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
	spin_unlock(&dlm->spinlock);

	/* never assert to ourselves */
	clear_bit(dlm->node_num, nodemap);
	if (ignore_higher) {
		/* if this is just to clear up mles for nodes below
		 * this node, do not send the message to the original
		 * caller or any node number higher than this */
		clear_bit(request_from, nodemap);
		bit = dlm->node_num;
		while (1) {
			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
					    bit+1);
			if (bit >= O2NM_MAX_NODES)
				break;
			clear_bit(bit, nodemap);
		}
	}

	/*
	 * If we're migrating this lock to someone else, we are no
	 * longer allowed to assert our own mastery.  OTOH, we need to
	 * prevent migration from starting while we're still asserting
	 * our dominance.  The reserved ast delays migration.
	 */
	spin_lock(&res->spinlock);
	if (res->state & DLM_LOCK_RES_MIGRATING) {
		mlog(0, "Someone asked us to assert mastery, but we're "
		     "in the middle of migration.  Skipping assert, "
		     "the new master will handle that.\n");
		spin_unlock(&res->spinlock);
		goto put;
	} else
		__dlm_lockres_reserve_ast(res);
	spin_unlock(&res->spinlock);

	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	mlog(0, "worker about to master %.*s here, this=%u\n",
	     res->lockname.len, res->lockname.name, dlm->node_num);
	ret = dlm_do_assert_master(dlm, res, nodemap, flags);
	if (ret < 0) {
		/* no need to restart, we are done */
		if (!dlm_is_host_down(ret))
			mlog_errno(ret);
	}

	/* Ok, we've asserted ourselves.  Let's let migration start. */
	dlm_lockres_release_ast(dlm, res);

put:
	/* balances the ref taken in dlm_dispatch_assert_master */
	dlm_lockres_put(res);

	mlog(0, "finished with dlm_assert_master_worker\n");
}
/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
 * We cannot wait for node recovery to complete to begin mastering this
 * lockres because this lockres is used to kick off recovery! ;-)
 * So, do a pre-check on all living nodes to see if any of those nodes
 * think that $RECOVERY is currently mastered by a dead node.  If so,
 * we wait a short time to allow that node to get notified by its own
 * heartbeat stack, then check again.  All $RECOVERY lock resources
 * mastered by dead nodes are purged when the heartbeat callback is
 * fired, so we can know for sure that it is safe to continue once
 * the node returns a live node or no node.
 *
 * Returns 0 when it is safe to proceed, -EAGAIN when some node still
 * believes a dead node masters the recovery lock; BUGs on any requery
 * failure other than the queried host being down. */
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res)
{
	struct dlm_node_iter iter;
	int nodenum;
	int ret = 0;
	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;

	/* snapshot of the domain membership for iteration */
	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	spin_unlock(&dlm->spinlock);

	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		/* do not send to self */
		if (nodenum == dlm->node_num)
			continue;
		ret = dlm_do_master_requery(dlm, res, nodenum, &master);
		if (ret < 0) {
			mlog_errno(ret);
			if (!dlm_is_host_down(ret))
				BUG();
			/* host is down, so answer for that node would be
			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
			ret = 0;
		}

		if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
			/* check to see if this master is in the recovery map */
			spin_lock(&dlm->spinlock);
			if (test_bit(master, dlm->recovery_map)) {
				mlog(ML_NOTICE, "%s: node %u has not seen "
				     "node %u go down yet, and thinks the "
				     "dead node is mastering the recovery "
				     "lock.  must wait.\n", dlm->name,
				     nodenum, master);
				ret = -EAGAIN;
			}
			spin_unlock(&dlm->spinlock);
			mlog(0, "%s: reco lock master is %u\n", dlm->name,
			     master);
			/* first definite answer wins; stop polling */
			break;
		}
	}
	return ret;
}
2254ba2bf218SKurt Hackel deref.namelen = namelen; 2255ba2bf218SKurt Hackel memcpy(deref.name, lockname, namelen); 2256ba2bf218SKurt Hackel 2257ba2bf218SKurt Hackel ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2258ba2bf218SKurt Hackel &deref, sizeof(deref), res->owner, &r); 2259ba2bf218SKurt Hackel if (ret < 0) 2260ba2bf218SKurt Hackel mlog_errno(ret); 2261ba2bf218SKurt Hackel else if (r < 0) { 2262ba2bf218SKurt Hackel /* BAD. other node says I did not have a ref. */ 2263ba2bf218SKurt Hackel mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2264ba2bf218SKurt Hackel "(master=%u) got %d.\n", dlm->name, namelen, 2265ba2bf218SKurt Hackel lockname, res->owner, r); 2266ba2bf218SKurt Hackel dlm_print_one_lock_resource(res); 2267ba2bf218SKurt Hackel BUG(); 2268ba2bf218SKurt Hackel } 2269ba2bf218SKurt Hackel return ret; 2270ba2bf218SKurt Hackel } 2271ba2bf218SKurt Hackel 2272d74c9803SKurt Hackel int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 2273d74c9803SKurt Hackel void **ret_data) 2274ba2bf218SKurt Hackel { 2275ba2bf218SKurt Hackel struct dlm_ctxt *dlm = data; 2276ba2bf218SKurt Hackel struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; 2277ba2bf218SKurt Hackel struct dlm_lock_resource *res = NULL; 2278ba2bf218SKurt Hackel char *name; 2279ba2bf218SKurt Hackel unsigned int namelen; 2280ba2bf218SKurt Hackel int ret = -EINVAL; 2281ba2bf218SKurt Hackel u8 node; 2282ba2bf218SKurt Hackel unsigned int hash; 2283f3f85464SSunil Mushran struct dlm_work_item *item; 2284f3f85464SSunil Mushran int cleared = 0; 2285f3f85464SSunil Mushran int dispatch = 0; 2286ba2bf218SKurt Hackel 2287ba2bf218SKurt Hackel if (!dlm_grab(dlm)) 2288ba2bf218SKurt Hackel return 0; 2289ba2bf218SKurt Hackel 2290ba2bf218SKurt Hackel name = deref->name; 2291ba2bf218SKurt Hackel namelen = deref->namelen; 2292ba2bf218SKurt Hackel node = deref->node_idx; 2293ba2bf218SKurt Hackel 2294ba2bf218SKurt Hackel if (namelen > DLM_LOCKID_NAME_MAX) { 2295ba2bf218SKurt 
/* Network handler for DLM_DEREF_LOCKRES_MSG: a non-master node tells us
 * (the master) that it no longer references a lockres, so its bit can be
 * cleared from the refmap.  If an assert_master reply is still in flight
 * (SETREF_INPROG), the clear is deferred to dlm_deref_lockres_worker.
 * Returns 0 on success/dispatch, -EINVAL on bad input, -ENOMEM if the
 * deferred work item cannot be allocated. */
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
			      void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
	struct dlm_lock_resource *res = NULL;
	char *name;
	unsigned int namelen;
	int ret = -EINVAL;
	u8 node;
	unsigned int hash;
	struct dlm_work_item *item;
	int cleared = 0;
	int dispatch = 0;

	if (!dlm_grab(dlm))
		return 0;

	name = deref->name;
	namelen = deref->namelen;
	node = deref->node_idx;

	/* validate the over-the-wire fields before touching anything */
	if (namelen > DLM_LOCKID_NAME_MAX) {
		mlog(ML_ERROR, "Invalid name length!");
		goto done;
	}
	if (deref->node_idx >= O2NM_MAX_NODES) {
		mlog(ML_ERROR, "Invalid node number: %u\n", node);
		goto done;
	}

	hash = dlm_lockid_hash(name, namelen);

	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
	if (!res) {
		spin_unlock(&dlm->spinlock);
		mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
		     dlm->name, namelen, name);
		goto done;
	}
	spin_unlock(&dlm->spinlock);

	/* either clear the refmap bit now, or defer to the worker if a
	 * setref is still in progress for this lockres */
	spin_lock(&res->spinlock);
	if (res->state & DLM_LOCK_RES_SETREF_INPROG)
		dispatch = 1;
	else {
		BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
		if (test_bit(node, res->refmap)) {
			dlm_lockres_clear_refmap_bit(node, res);
			cleared = 1;
		}
	}
	spin_unlock(&res->spinlock);

	if (!dispatch) {
		if (cleared)
			dlm_lockres_calc_usage(dlm, res);
		else {
			/* double-deref from the same node is a protocol bug */
			mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
			     "but it is already dropped!\n", dlm->name,
			     res->lockname.len, res->lockname.name, node);
			dlm_print_one_lock_resource(res);
		}
		ret = 0;
		goto done;
	}

	item = kzalloc(sizeof(*item), GFP_NOFS);
	if (!item) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto done;
	}

	dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
	item->u.dl.deref_res = res;
	item->u.dl.deref_node = node;

	spin_lock(&dlm->work_lock);
	list_add_tail(&item->list, &dlm->work_list);
	spin_unlock(&dlm->work_lock);

	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
	/* NOTE: res ref is handed off to the work item on this path */
	return 0;

done:
	if (res)
		dlm_lockres_put(res);
	dlm_put(dlm);

	return ret;
}
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2381f3f85464SSunil Mushran dlm_lockres_clear_refmap_bit(node, res); 2382f3f85464SSunil Mushran cleared = 1; 2383f3f85464SSunil Mushran } 2384f3f85464SSunil Mushran spin_unlock(&res->spinlock); 2385f3f85464SSunil Mushran 2386f3f85464SSunil Mushran if (cleared) { 2387f3f85464SSunil Mushran mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", 2388f3f85464SSunil Mushran dlm->name, res->lockname.len, res->lockname.name, node); 2389f3f85464SSunil Mushran dlm_lockres_calc_usage(dlm, res); 2390f3f85464SSunil Mushran } else { 2391f3f85464SSunil Mushran mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 2392f3f85464SSunil Mushran "but it is already dropped!\n", dlm->name, 2393f3f85464SSunil Mushran res->lockname.len, res->lockname.name, node); 23942af37ce8STao Ma dlm_print_one_lock_resource(res); 2395f3f85464SSunil Mushran } 2396f3f85464SSunil Mushran 2397f3f85464SSunil Mushran dlm_lockres_put(res); 2398f3f85464SSunil Mushran } 2399f3f85464SSunil Mushran 24002f5bf1f2SSunil Mushran /* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 24012f5bf1f2SSunil Mushran * if not. If 0, numlocks is set to the number of locks in the lockres. 
24022f5bf1f2SSunil Mushran */ 24032f5bf1f2SSunil Mushran static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 24042f5bf1f2SSunil Mushran struct dlm_lock_resource *res, 24052f5bf1f2SSunil Mushran int *numlocks) 24062f5bf1f2SSunil Mushran { 24072f5bf1f2SSunil Mushran int ret; 24082f5bf1f2SSunil Mushran int i; 24092f5bf1f2SSunil Mushran int count = 0; 2410800deef3SChristoph Hellwig struct list_head *queue; 24112f5bf1f2SSunil Mushran struct dlm_lock *lock; 24122f5bf1f2SSunil Mushran 24132f5bf1f2SSunil Mushran assert_spin_locked(&res->spinlock); 24142f5bf1f2SSunil Mushran 24152f5bf1f2SSunil Mushran ret = -EINVAL; 24162f5bf1f2SSunil Mushran if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 24172f5bf1f2SSunil Mushran mlog(0, "cannot migrate lockres with unknown owner!\n"); 24182f5bf1f2SSunil Mushran goto leave; 24192f5bf1f2SSunil Mushran } 24202f5bf1f2SSunil Mushran 24212f5bf1f2SSunil Mushran if (res->owner != dlm->node_num) { 24222f5bf1f2SSunil Mushran mlog(0, "cannot migrate lockres this node doesn't own!\n"); 24232f5bf1f2SSunil Mushran goto leave; 24242f5bf1f2SSunil Mushran } 24252f5bf1f2SSunil Mushran 24262f5bf1f2SSunil Mushran ret = 0; 24272f5bf1f2SSunil Mushran queue = &res->granted; 24282f5bf1f2SSunil Mushran for (i = 0; i < 3; i++) { 2429800deef3SChristoph Hellwig list_for_each_entry(lock, queue, list) { 24302f5bf1f2SSunil Mushran ++count; 24312f5bf1f2SSunil Mushran if (lock->ml.node == dlm->node_num) { 24322f5bf1f2SSunil Mushran mlog(0, "found a lock owned by this node still " 24332f5bf1f2SSunil Mushran "on the %s queue! will not migrate this " 24342f5bf1f2SSunil Mushran "lockres\n", (i == 0 ? "granted" : 24352f5bf1f2SSunil Mushran (i == 1 ? 
"converting" : 24362f5bf1f2SSunil Mushran "blocked"))); 24372f5bf1f2SSunil Mushran ret = -ENOTEMPTY; 24382f5bf1f2SSunil Mushran goto leave; 24392f5bf1f2SSunil Mushran } 24402f5bf1f2SSunil Mushran } 24412f5bf1f2SSunil Mushran queue++; 24422f5bf1f2SSunil Mushran } 24432f5bf1f2SSunil Mushran 24442f5bf1f2SSunil Mushran *numlocks = count; 24452f5bf1f2SSunil Mushran mlog(0, "migrateable lockres having %d locks\n", *numlocks); 24462f5bf1f2SSunil Mushran 24472f5bf1f2SSunil Mushran leave: 24482f5bf1f2SSunil Mushran return ret; 24492f5bf1f2SSunil Mushran } 24506714d8e8SKurt Hackel 24516714d8e8SKurt Hackel /* 24526714d8e8SKurt Hackel * DLM_MIGRATE_LOCKRES 24536714d8e8SKurt Hackel */ 24546714d8e8SKurt Hackel 24556714d8e8SKurt Hackel 2456faf0ec9fSAdrian Bunk static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2457faf0ec9fSAdrian Bunk struct dlm_lock_resource *res, 24586714d8e8SKurt Hackel u8 target) 24596714d8e8SKurt Hackel { 24606714d8e8SKurt Hackel struct dlm_master_list_entry *mle = NULL; 24616714d8e8SKurt Hackel struct dlm_master_list_entry *oldmle = NULL; 24626714d8e8SKurt Hackel struct dlm_migratable_lockres *mres = NULL; 24632f5bf1f2SSunil Mushran int ret = 0; 24646714d8e8SKurt Hackel const char *name; 24656714d8e8SKurt Hackel unsigned int namelen; 24666714d8e8SKurt Hackel int mle_added = 0; 24672f5bf1f2SSunil Mushran int numlocks; 24682f5bf1f2SSunil Mushran int wake = 0; 24696714d8e8SKurt Hackel 24706714d8e8SKurt Hackel if (!dlm_grab(dlm)) 24716714d8e8SKurt Hackel return -EINVAL; 24726714d8e8SKurt Hackel 24736714d8e8SKurt Hackel name = res->lockname.name; 24746714d8e8SKurt Hackel namelen = res->lockname.len; 24756714d8e8SKurt Hackel 24766714d8e8SKurt Hackel mlog(0, "migrating %.*s to %u\n", namelen, name, target); 24776714d8e8SKurt Hackel 24786714d8e8SKurt Hackel /* 24796714d8e8SKurt Hackel * ensure this lockres is a proper candidate for migration 24806714d8e8SKurt Hackel */ 24816714d8e8SKurt Hackel spin_lock(&res->spinlock); 24822f5bf1f2SSunil Mushran ret = 
/* Migrate mastery of a locally-mastered lockres to node 'target' (or to
 * a node of our choosing if target is invalid/dead).  On success the
 * owner is changed and non-local locks are dropped; on any failure the
 * MIGRATING flag is cleared, waiters are woken and the lockres is
 * re-dirtied.  Returns 0 on success or a negative error. */
static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
			       struct dlm_lock_resource *res,
			       u8 target)
{
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_master_list_entry *oldmle = NULL;
	struct dlm_migratable_lockres *mres = NULL;
	int ret = 0;
	const char *name;
	unsigned int namelen;
	int mle_added = 0;
	int numlocks;
	int wake = 0;

	if (!dlm_grab(dlm))
		return -EINVAL;

	name = res->lockname.name;
	namelen = res->lockname.len;

	mlog(0, "migrating %.*s to %u\n", namelen, name, target);

	/*
	 * ensure this lockres is a proper candidate for migration
	 */
	spin_lock(&res->spinlock);
	ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
	if (ret < 0) {
		spin_unlock(&res->spinlock);
		goto leave;
	}
	spin_unlock(&res->spinlock);

	/* no work to do */
	if (numlocks == 0) {
		mlog(0, "no locks were found on this lockres! done!\n");
		goto leave;
	}

	/*
	 * preallocate up front
	 * if this fails, abort
	 */

	ret = -ENOMEM;
	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
	if (!mres) {
		mlog_errno(ret);
		goto leave;
	}

	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
								GFP_NOFS);
	if (!mle) {
		mlog_errno(ret);
		goto leave;
	}
	ret = 0;

	/*
	 * find a node to migrate the lockres to
	 */

	mlog(0, "picking a migration node\n");
	spin_lock(&dlm->spinlock);
	/* pick a new node if the requested target is invalid or dead */
	if (!test_bit(target, dlm->domain_map) ||
	    target >= O2NM_MAX_NODES) {
		target = dlm_pick_migration_target(dlm, res);
	}
	mlog(0, "node %u chosen for migration\n", target);

	if (target >= O2NM_MAX_NODES ||
	    !test_bit(target, dlm->domain_map)) {
		/* target chosen is not alive */
		ret = -EINVAL;
	}

	if (ret) {
		spin_unlock(&dlm->spinlock);
		goto fail;
	}

	mlog(0, "continuing with target = %u\n", target);

	/*
	 * clear any existing master requests and
	 * add the migration mle to the list
	 */
	spin_lock(&dlm->master_lock);
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
				    namelen, target, dlm->node_num);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (ret == -EEXIST) {
		mlog(0, "another process is already migrating it\n");
		goto fail;
	}
	mle_added = 1;

	/*
	 * set the MIGRATING flag and flush asts
	 * if we fail after this we need to re-dirty the lockres
	 */
	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
		     "the target went down.\n", res->lockname.len,
		     res->lockname.name, target);
		spin_lock(&res->spinlock);
		res->state &= ~DLM_LOCK_RES_MIGRATING;
		wake = 1;
		spin_unlock(&res->spinlock);
		ret = -EINVAL;
	}

fail:
	if (oldmle) {
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}

	if (ret < 0) {
		/* mle ownership: only free directly if it was never
		 * added to the master list */
		if (mle_added) {
			dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
		} else if (mle) {
			kmem_cache_free(dlm_mle_cache, mle);
		}
		goto leave;
	}

	/*
	 * at this point, we have a migration target, an mle
	 * in the master list, and the MIGRATING flag set on
	 * the lockres
	 */

	/* now that remote nodes are spinning on the MIGRATING flag,
	 * ensure that all assert_master work is flushed. */
	flush_workqueue(dlm->dlm_worker);

	/* get an extra reference on the mle.
	 * otherwise the assert_master from the new
	 * master will destroy this.
	 * also, make sure that all callers of dlm_get_mle
	 * take both dlm->spinlock and dlm->master_lock */
	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	dlm_get_mle_inuse(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	/* notify new node and send all lock state */
	/* call send_one_lockres with migration flag.
	 * this serves as notice to the target node that a
	 * migration is starting. */
	ret = dlm_send_one_lockres(dlm, res, mres, target,
				   DLM_MRES_MIGRATION);

	if (ret < 0) {
		mlog(0, "migration to node %u failed with %d\n",
		     target, ret);
		/* migration failed, detach and clean up mle */
		dlm_mle_detach_hb_events(dlm, mle);
		dlm_put_mle(mle);
		dlm_put_mle_inuse(mle);
		spin_lock(&res->spinlock);
		res->state &= ~DLM_LOCK_RES_MIGRATING;
		wake = 1;
		spin_unlock(&res->spinlock);
		goto leave;
	}

	/* at this point, the target sends a message to all nodes,
	 * (using dlm_do_migrate_request).  this node is skipped since
	 * we had to put an mle in the list to begin the process.  this
	 * node now waits for target to do an assert master.  this node
	 * will be the last one notified, ensuring that the migration
	 * is complete everywhere.  if the target dies while this is
	 * going on, some nodes could potentially see the target as the
	 * master, so it is important that my recovery finds the migration
	 * mle and sets the master to UNKNOWN. */


	/* wait for new node to assert master */
	while (1) {
		ret = wait_event_interruptible_timeout(mle->wq,
					(atomic_read(&mle->woken) == 1),
					msecs_to_jiffies(5000));

		if (ret >= 0) {
			if (atomic_read(&mle->woken) == 1 ||
			    res->owner == target)
				break;

			mlog(0, "%s:%.*s: timed out during migration\n",
			     dlm->name, res->lockname.len, res->lockname.name);
			/* avoid hang during shutdown when migrating lockres
			 * to a node which also goes down */
			if (dlm_is_node_dead(dlm, target)) {
				mlog(0, "%s:%.*s: expected migration "
				     "target %u is no longer up, restarting\n",
				     dlm->name, res->lockname.len,
				     res->lockname.name, target);
				ret = -EINVAL;
				/* migration failed, detach and clean up mle */
				dlm_mle_detach_hb_events(dlm, mle);
				dlm_put_mle(mle);
				dlm_put_mle_inuse(mle);
				spin_lock(&res->spinlock);
				res->state &= ~DLM_LOCK_RES_MIGRATING;
				wake = 1;
				spin_unlock(&res->spinlock);
				goto leave;
			}
		} else
			mlog(0, "%s:%.*s: caught signal during migration\n",
			     dlm->name, res->lockname.len, res->lockname.name);
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, target);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	dlm_remove_nonlocal_locks(dlm, res);
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

	/* master is known, detach if not already detached */
	dlm_mle_detach_hb_events(dlm, mle);
	dlm_put_mle_inuse(mle);
	ret = 0;

	dlm_lockres_calc_usage(dlm, res);

leave:
	/* re-dirty the lockres if we failed */
	if (ret < 0)
		dlm_kick_thread(dlm, res);

	/* wake up waiters if the MIGRATING flag got set
	 * but migration failed */
	if (wake)
		wake_up(&res->wq);

	/* TODO: cleanup */
	if (mres)
		free_page((unsigned long)mres);

	dlm_put(dlm);

	mlog(0, "returning %d\n", ret);
	return ret;
}
100

/* Should be called only after beginning the domain leave process.
 * There should not be any remaining locks on nonlocal lock resources,
 * and there should be no local locks left on locally mastered resources.
 *
 * Called with the dlm spinlock held, may drop it to do migration, but
 * will re-acquire before exit.
 *
 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
	int ret;
	int lock_dropped = 0;
	int numlocks;

	spin_lock(&res->spinlock);
	if (res->owner != dlm->node_num) {
		/* not the master: nothing to migrate, but warn if locks
		 * are still attached (should not happen during leave) */
		if (!__dlm_lockres_unused(res)) {
			mlog(ML_ERROR, "%s:%.*s: this node is not master, "
			     "trying to free this but locks remain\n",
			     dlm->name, res->lockname.len, res->lockname.name);
		}
		spin_unlock(&res->spinlock);
		goto leave;
	}

	/* No need to migrate a lockres having no locks */
	ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
	if (ret >= 0 && numlocks == 0) {
		spin_unlock(&res->spinlock);
		goto leave;
	}
	spin_unlock(&res->spinlock);

	/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
	spin_unlock(&dlm->spinlock);
	lock_dropped = 1;
	/* retry the migration until it succeeds or the lockres turns
	 * out to still hold local locks (a bug at this stage of leave) */
	while (1) {
		ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
		if (ret >= 0)
			break;
		if (ret == -ENOTEMPTY) {
			mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
			     res->lockname.len, res->lockname.name);
			BUG();
		}

		mlog(0, "lockres %.*s: migrate failed, "
		     "retrying\n", res->lockname.len,
		     res->lockname.name);
		msleep(DLM_MIGRATION_RETRY_MS);
	}
	spin_lock(&dlm->spinlock);
leave:
	return lock_dropped;
}

/* Returns nonzero once this lock has no queued or pending basts.
 * Sampled under dlm->ast_lock and lock->spinlock, so the answer is
 * only a snapshot; callers use it as a wait_event condition. */
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
	int ret;
	spin_lock(&dlm->ast_lock);
	spin_lock(&lock->spinlock);
	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
	spin_unlock(&lock->spinlock);
	spin_unlock(&dlm->ast_lock);
	return ret;
}

/* Wait condition for migration: proceed once MIGRATING is set on the
 * lockres, or once the migration target has left the domain map. */
static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     u8 mig_target)
{
	int can_proceed;
	spin_lock(&res->spinlock);
	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
	spin_unlock(&res->spinlock);

	/* target has died, so make the caller break out of the
	 * wait_event, but caller must recheck the domain_map */
	spin_lock(&dlm->spinlock);
	if (!test_bit(mig_target, dlm->domain_map))
		can_proceed = 1;
	spin_unlock(&dlm->spinlock);
	return can_proceed;
}

/* Snapshot of the DIRTY flag, taken under res->spinlock. */
static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
				struct dlm_lock_resource *res)
{
	int ret;
	spin_lock(&res->spinlock);
	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
	spin_unlock(&res->spinlock);
	return ret;
}


/* Set DLM_LOCK_RES_MIGRATING on the lockres after flushing all
 * outstanding asts; returns 0 on success or -EHOSTDOWN if the
 * migration target dropped out of the domain while we waited. */
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 target)
{
	int ret = 0;

	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
	     res->lockname.len, res->lockname.name, dlm->node_num,
	     target);
	/* need to set MIGRATING flag on lockres. this is done by
	 * ensuring that all asts have been flushed for this lockres.
	 */
	spin_lock(&res->spinlock);
	BUG_ON(res->migration_pending);
	res->migration_pending = 1;
	/* strategy is to reserve an extra ast then release
	 * it below, letting the release do all of the work */
	__dlm_lockres_reserve_ast(res);
	spin_unlock(&res->spinlock);

	/* now flush all the pending asts */
	dlm_kick_thread(dlm, res);
	/* before waiting on DIRTY, block processes which may
	 * try to dirty the lockres before MIGRATING is set */
	spin_lock(&res->spinlock);
	BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
	res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
	spin_unlock(&res->spinlock);
	/* now wait on any pending asts and the DIRTY state */
	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
	dlm_lockres_release_ast(dlm, res);

	mlog(0, "about to wait on migration_wq, dirty=%s\n",
	     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
	/* if the extra ref we just put was the final one, this
	 * will pass thru immediately. otherwise, we need to wait
	 * for the last ast to finish. */
again:
	/* wake on MIGRATING being set or the target dying; poll at
	 * 1s so a missed wakeup cannot hang the migration forever */
	ret = wait_event_interruptible_timeout(dlm->migration_wq,
		   dlm_migration_can_proceed(dlm, res, target),
		   msecs_to_jiffies(1000));
	if (ret < 0) {
		mlog(0, "woken again: migrating? %s, dead? %s\n",
		     res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
		     test_bit(target, dlm->domain_map) ? "no":"yes");
	} else {
		mlog(0, "all is well: migrating? %s, dead? %s\n",
		     res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
		     test_bit(target, dlm->domain_map) ? "no":"yes");
	}
	if (!dlm_migration_can_proceed(dlm, res, target)) {
		mlog(0, "trying again...\n");
		goto again;
	}
	/* now that we are sure the MIGRATING state is there, drop
	 * the unneded state which blocked threads trying to DIRTY */
	spin_lock(&res->spinlock);
	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
	BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
	spin_unlock(&res->spinlock);

	/* did the target go down or die? */
	spin_lock(&dlm->spinlock);
	if (!test_bit(target, dlm->domain_map)) {
		mlog(ML_ERROR, "aha. migration target %u just went down\n",
		     target);
		ret = -EHOSTDOWN;
	}
	spin_unlock(&dlm->spinlock);

	/*
	 * at this point:
	 *
	 * o the DLM_LOCK_RES_MIGRATING flag is set
	 * o there are no pending asts on this lockres
	 * o all processes trying to reserve an ast on this
	 *   lockres must wait for the MIGRATING flag to clear
	 */
	return ret;
}

/* last step in the migration process.
 * original master calls this to free all of the dlm_lock
 * structures that used to be for other nodes. */
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res)
{
	struct list_head *queue = &res->granted;
	int i, bit;
	struct dlm_lock *lock, *next;

	assert_spin_locked(&res->spinlock);

	BUG_ON(res->owner == dlm->node_num);

	/* walk granted, converting and blocked queues in order */
	for (i=0; i<3; i++) {
		list_for_each_entry_safe(lock, next, queue, list) {
			if (lock->ml.node != dlm->node_num) {
				mlog(0, "putting lock for node %u\n",
				     lock->ml.node);
				/* be extra careful */
				BUG_ON(!list_empty(&lock->ast_list));
				BUG_ON(!list_empty(&lock->bast_list));
				BUG_ON(lock->ast_pending);
				BUG_ON(lock->bast_pending);
				dlm_lockres_clear_refmap_bit(lock->ml.node, res);
				list_del_init(&lock->list);
				dlm_lock_put(lock);
				/* In a normal unlock, we would have added a
				 * DLM_UNLOCK_FREE_LOCK action. Force it. */
				dlm_lock_put(lock);
			}
		}
		queue++;
	}
	/* clear any leftover refmap bits for remote nodes */
	bit = 0;
	while (1) {
		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
		if (bit >= O2NM_MAX_NODES)
			break;
		/* do not clear the local node reference, if there is a
		 * process holding this, let it drop the ref itself */
		if (bit != dlm->node_num) {
			mlog(0, "%s:%.*s: node %u had a ref to this "
			     "migrating lockres, clearing\n", dlm->name,
			     res->lockname.len, res->lockname.name, bit);
			dlm_lockres_clear_refmap_bit(bit, res);
		}
		bit++;
	}
}

/* for now this is not too intelligent. we will
 * need stats to make this do the right thing.
 * this just finds the first lock on one of the
 * queues and uses that node as the target. */
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res)
{
	int i;
	struct list_head *queue = &res->granted;
	struct dlm_lock *lock;
	int nodenum;

	assert_spin_locked(&dlm->spinlock);

	/* first choice: any remote node already holding a lock on
	 * one of the three queues */
	spin_lock(&res->spinlock);
	for (i=0; i<3; i++) {
		list_for_each_entry(lock, queue, list) {
			/* up to the caller to make sure this node
			 * is alive */
			if (lock->ml.node != dlm->node_num) {
				spin_unlock(&res->spinlock);
				return lock->ml.node;
			}
		}
		queue++;
	}
	spin_unlock(&res->spinlock);
	mlog(0, "have not found a suitable target yet! checking domain map\n");

	/* ok now we're getting desperate. pick anyone alive. */
	nodenum = -1;
	while (1) {
		nodenum = find_next_bit(dlm->domain_map,
					O2NM_MAX_NODES, nodenum+1);
		mlog(0, "found %d in domain map\n", nodenum);
		if (nodenum >= O2NM_MAX_NODES)
			break;
		if (nodenum != dlm->node_num) {
			mlog(0, "picking %d\n", nodenum);
			return nodenum;
		}
	}

	mlog(0, "giving up. no master to migrate to\n");
	return DLM_LOCK_RES_OWNER_UNKNOWN;
}



/* this is called by the new master once all lockres
 * data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res,
				  u8 master, u8 new_master,
				  struct dlm_node_iter *iter)
{
	struct dlm_migrate_request migrate;
	int ret, skip, status = 0;
	int nodenum;

	memset(&migrate, 0, sizeof(migrate));
	migrate.namelen = res->lockname.len;
	memcpy(migrate.name, res->lockname.name, migrate.namelen);
	migrate.new_master = new_master;
	migrate.master = master;

	ret = 0;

	/* send message to all nodes, except the master and myself */
	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
		if (nodenum == master ||
		    nodenum == new_master)
			continue;

		/* We could race exit domain. If exited, skip.
		 */
		spin_lock(&dlm->spinlock);
		skip = (!test_bit(nodenum, dlm->domain_map));
		spin_unlock(&dlm->spinlock);
		if (skip) {
			clear_bit(nodenum, iter->node_map);
			continue;
		}

		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
					 &migrate, sizeof(migrate), nodenum,
					 &status);
		if (ret < 0) {
			/* transport error: only a down node is tolerated;
			 * any other failure is a fatal bug */
			mlog(0, "migrate_request returned %d!\n", ret);
			if (!dlm_is_host_down(ret)) {
				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
				BUG();
			}
			clear_bit(nodenum, iter->node_map);
			ret = 0;
		} else if (status < 0) {
			/* remote handler rejected the request */
			mlog(0, "migrate request (node %u) returned %d!\n",
			     nodenum, status);
			ret = status;
		} else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
			/* during the migration request we short-circuited
			 * the mastery of the lockres. make sure we have
			 * a mastery ref for nodenum */
			mlog(0, "%s:%.*s: need ref for node %u\n",
			     dlm->name, res->lockname.len, res->lockname.name,
			     nodenum);
			spin_lock(&res->spinlock);
			dlm_lockres_set_refmap_bit(nodenum, res);
			spin_unlock(&res->spinlock);
		}
	}

	if (ret < 0)
		mlog_errno(ret);

	mlog(0, "returning ret=%d\n", ret);
	return ret;
}


/* if there is an existing mle for this lockres, we now know who the master is.
 * (the one who sent us *this* message) we can clear it up right away.
 * since the process that put the mle on the list still has a reference to it,
 * we can unhash it now, set the master and wake the process. as a result,
 * we will have no mle in the list to start with. now we can add an mle for
 * the migration and this should be the only one found for those scanning the
 * list. */
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
				void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
	const char *name;
	unsigned int namelen, hash;
	int ret = 0;

	if (!dlm_grab(dlm))
		return -EINVAL;

	name = migrate->name;
	namelen = migrate->namelen;
	hash = dlm_lockid_hash(name, namelen);

	/* preallocate.. if this fails, abort */
	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
								GFP_NOFS);

	if (!mle) {
		ret = -ENOMEM;
		goto leave;
	}

	/* check for pre-existing lock */
	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
	spin_lock(&dlm->master_lock);

	if (res) {
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING) {
			/* if all is working ok, this can only mean that we got
			 * a migrate request from a node that we now see as
			 * dead. what can we do here? drop it to the floor?
			 */
			spin_unlock(&res->spinlock);
			mlog(ML_ERROR, "Got a migrate request, but the "
			     "lockres is marked as recovering!");
			kmem_cache_free(dlm_mle_cache, mle);
			ret = -EINVAL; /* need a better solution */
			goto unlock;
		}
		res->state |= DLM_LOCK_RES_MIGRATING;
		spin_unlock(&res->spinlock);
	}

	/* ignore status. only nonzero status would BUG. */
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
				    name, namelen,
				    migrate->new_master,
				    migrate->master);

unlock:
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (oldmle) {
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}

	if (res)
		dlm_lockres_put(res);
leave:
	dlm_put(dlm);
	return ret;
}

/* must be holding dlm->spinlock and dlm->master_lock
 * when adding a migration mle, we can clear any other mles
 * in the master list because we know with certainty that
 * the master is "master". so we remove any old mle from
 * the list after setting it's master field, and then add
 * the new migration mle. this way we can hold with the rule
 * of having only one mle for a given lock name at all times. */
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
				 struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle,
				 struct dlm_master_list_entry **oldmle,
				 const char *name, unsigned int namelen,
				 u8 new_master, u8 master)
{
	int found;
	int ret = 0;

	*oldmle = NULL;

	mlog_entry_void();

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);

	/* caller is responsible for any ref taken here on oldmle */
	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
	if (found) {
		struct dlm_master_list_entry *tmp = *oldmle;
		spin_lock(&tmp->spinlock);
		if (tmp->type == DLM_MLE_MIGRATION) {
			if (master == dlm->node_num) {
				/* ah another process raced me to it */
				mlog(0, "tried to migrate %.*s, but some "
				     "process beat me to it\n",
				     namelen, name);
				ret = -EEXIST;
			} else {
				/* bad. 2 NODES are trying to migrate!
				 */
				mlog(ML_ERROR, "migration error mle: "
				     "master=%u new_master=%u // request: "
				     "master=%u new_master=%u // "
				     "lockres=%.*s\n",
				     tmp->master, tmp->new_master,
				     master, new_master,
				     namelen, name);
				BUG();
			}
		} else {
			/* this is essentially what assert_master does */
			tmp->master = master;
			atomic_set(&tmp->woken, 1);
			wake_up(&tmp->wq);
			/* remove it so that only one mle will be found */
			__dlm_unlink_mle(dlm, tmp);
			__dlm_mle_detach_hb_events(dlm, tmp);
			ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
			mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
			     "telling master to get ref for cleared out mle "
			     "during migration\n", dlm->name, namelen, name,
			     master, new_master);
		}
		spin_unlock(&tmp->spinlock);
	}

	/* now add a migration mle to the tail of the list */
	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
	mle->new_master = new_master;
	/* the new master will be sending an assert master for this.
3209ba2bf218SKurt Hackel * at that point we will get the refmap reference */ 32106714d8e8SKurt Hackel mle->master = master; 32116714d8e8SKurt Hackel /* do this for consistency with other mle types */ 32126714d8e8SKurt Hackel set_bit(new_master, mle->maybe_map); 32131c084577SSunil Mushran __dlm_insert_mle(dlm, mle); 32146714d8e8SKurt Hackel 32156714d8e8SKurt Hackel return ret; 32166714d8e8SKurt Hackel } 32176714d8e8SKurt Hackel 3218c2cd4a44SSunil Mushran /* 3219c2cd4a44SSunil Mushran * Sets the owner of the lockres, associated to the mle, to UNKNOWN 3220c2cd4a44SSunil Mushran */ 3221c2cd4a44SSunil Mushran static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, 3222c2cd4a44SSunil Mushran struct dlm_master_list_entry *mle) 3223c2cd4a44SSunil Mushran { 3224c2cd4a44SSunil Mushran struct dlm_lock_resource *res; 3225c2cd4a44SSunil Mushran unsigned int hash; 3226c2cd4a44SSunil Mushran 3227c2cd4a44SSunil Mushran /* Find the lockres associated to the mle and set its owner to UNK */ 3228c2cd4a44SSunil Mushran hash = dlm_lockid_hash(mle->u.mlename.name, mle->u.mlename.len); 3229c2cd4a44SSunil Mushran res = __dlm_lookup_lockres(dlm, mle->u.mlename.name, mle->u.mlename.len, 3230c2cd4a44SSunil Mushran hash); 3231c2cd4a44SSunil Mushran if (res) { 3232c2cd4a44SSunil Mushran spin_unlock(&dlm->master_lock); 3233c2cd4a44SSunil Mushran 3234c2cd4a44SSunil Mushran /* move lockres onto recovery list */ 3235c2cd4a44SSunil Mushran spin_lock(&res->spinlock); 3236c2cd4a44SSunil Mushran dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 3237c2cd4a44SSunil Mushran dlm_move_lockres_to_recovery_list(dlm, res); 3238c2cd4a44SSunil Mushran spin_unlock(&res->spinlock); 3239c2cd4a44SSunil Mushran dlm_lockres_put(res); 3240c2cd4a44SSunil Mushran 3241c2cd4a44SSunil Mushran /* about to get rid of mle, detach from heartbeat */ 3242c2cd4a44SSunil Mushran __dlm_mle_detach_hb_events(dlm, mle); 3243c2cd4a44SSunil Mushran 3244c2cd4a44SSunil Mushran /* dump the mle */ 
		spin_lock(&dlm->master_lock);
		__dlm_put_mle(mle);
		spin_unlock(&dlm->master_lock);
	}

	return res;
}

/*
 * Unlink an mle during dead-node cleanup: detach it from heartbeat
 * events, remove it from the master hash, and wake any waiters.  The
 * caller keeps its own reference and is still responsible for the
 * final __dlm_put_mle().
 */
static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
				    struct dlm_master_list_entry *mle)
{
	__dlm_mle_detach_hb_events(dlm, mle);

	spin_lock(&mle->spinlock);
	__dlm_unlink_mle(dlm, mle);
	atomic_set(&mle->woken, 1);
	spin_unlock(&mle->spinlock);

	wake_up(&mle->wq);
}

/*
 * Handle a BLOCK mle on the death of @dead_node.  If the dead node is
 * the lowest set bit in maybe_map (i.e. the node that was expected to
 * become master), drop the reference that the now never-arriving
 * assert_master would have released; otherwise leave the mle alone.
 */
static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
				struct dlm_master_list_entry *mle, u8 dead_node)
{
	int bit;

	BUG_ON(mle->type != DLM_MLE_BLOCK);

	spin_lock(&mle->spinlock);
	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
	if (bit != dead_node) {
		mlog(0, "mle found, but dead node %u would not have been "
		     "master\n", dead_node);
		spin_unlock(&mle->spinlock);
	} else {
		/* Must drop the refcount by one since the assert_master will
		 * never arrive.
This may result in the mle being unlinked and 3282c2cd4a44SSunil Mushran * freed, but there may still be a process waiting in the 3283c2cd4a44SSunil Mushran * dlmlock path which is fine. */ 3284c2cd4a44SSunil Mushran mlog(0, "node %u was expected master\n", dead_node); 3285c2cd4a44SSunil Mushran atomic_set(&mle->woken, 1); 3286c2cd4a44SSunil Mushran spin_unlock(&mle->spinlock); 3287c2cd4a44SSunil Mushran wake_up(&mle->wq); 3288c2cd4a44SSunil Mushran 3289c2cd4a44SSunil Mushran /* Do not need events any longer, so detach from heartbeat */ 3290c2cd4a44SSunil Mushran __dlm_mle_detach_hb_events(dlm, mle); 3291c2cd4a44SSunil Mushran __dlm_put_mle(mle); 3292c2cd4a44SSunil Mushran } 3293c2cd4a44SSunil Mushran } 32946714d8e8SKurt Hackel 32956714d8e8SKurt Hackel void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 32966714d8e8SKurt Hackel { 32972ed6c750SSunil Mushran struct dlm_master_list_entry *mle; 32986714d8e8SKurt Hackel struct dlm_lock_resource *res; 32992ed6c750SSunil Mushran struct hlist_head *bucket; 33002ed6c750SSunil Mushran struct hlist_node *list; 33012ed6c750SSunil Mushran unsigned int i; 33026714d8e8SKurt Hackel 33036714d8e8SKurt Hackel mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 33046714d8e8SKurt Hackel top: 33056714d8e8SKurt Hackel assert_spin_locked(&dlm->spinlock); 33066714d8e8SKurt Hackel 33076714d8e8SKurt Hackel /* clean the master list */ 33086714d8e8SKurt Hackel spin_lock(&dlm->master_lock); 33092ed6c750SSunil Mushran for (i = 0; i < DLM_HASH_BUCKETS; i++) { 33102ed6c750SSunil Mushran bucket = dlm_master_hash(dlm, i); 33112ed6c750SSunil Mushran hlist_for_each(list, bucket) { 33122ed6c750SSunil Mushran mle = hlist_entry(list, struct dlm_master_list_entry, 33132ed6c750SSunil Mushran master_hash_node); 33142ed6c750SSunil Mushran 33156714d8e8SKurt Hackel BUG_ON(mle->type != DLM_MLE_BLOCK && 33166714d8e8SKurt Hackel mle->type != DLM_MLE_MASTER && 33176714d8e8SKurt Hackel mle->type != DLM_MLE_MIGRATION); 33186714d8e8SKurt Hackel 
331967ae1f06SSunil Mushran /* MASTER mles are initiated locally. The waiting 332067ae1f06SSunil Mushran * process will notice the node map change shortly. 332167ae1f06SSunil Mushran * Let that happen as normal. */ 33226714d8e8SKurt Hackel if (mle->type == DLM_MLE_MASTER) 33236714d8e8SKurt Hackel continue; 33246714d8e8SKurt Hackel 332567ae1f06SSunil Mushran /* BLOCK mles are initiated by other nodes. Need to 332667ae1f06SSunil Mushran * clean up if the dead node would have been the 332767ae1f06SSunil Mushran * master. */ 33286714d8e8SKurt Hackel if (mle->type == DLM_MLE_BLOCK) { 3329c2cd4a44SSunil Mushran dlm_clean_block_mle(dlm, mle, dead_node); 33306714d8e8SKurt Hackel continue; 33316714d8e8SKurt Hackel } 33326714d8e8SKurt Hackel 333367ae1f06SSunil Mushran /* Everything else is a MIGRATION mle */ 33346714d8e8SKurt Hackel 333567ae1f06SSunil Mushran /* The rule for MIGRATION mles is that the master 333667ae1f06SSunil Mushran * becomes UNKNOWN if *either* the original or the new 333767ae1f06SSunil Mushran * master dies. All UNKNOWN lockres' are sent to 333867ae1f06SSunil Mushran * whichever node becomes the recovery master. The new 333967ae1f06SSunil Mushran * master is responsible for determining if there is 334067ae1f06SSunil Mushran * still a master for this lockres, or if he needs to 334167ae1f06SSunil Mushran * take over mastery. Either way, this node should 334267ae1f06SSunil Mushran * expect another message to resolve this. */ 334367ae1f06SSunil Mushran 33446714d8e8SKurt Hackel if (mle->master != dead_node && 33456714d8e8SKurt Hackel mle->new_master != dead_node) 33466714d8e8SKurt Hackel continue; 33476714d8e8SKurt Hackel 334867ae1f06SSunil Mushran /* If we have reached this point, this mle needs to be 334967ae1f06SSunil Mushran * removed from the list and freed. 
*/ 3350c2cd4a44SSunil Mushran dlm_clean_migration_mle(dlm, mle); 33516714d8e8SKurt Hackel 3352aa852354SKurt Hackel mlog(0, "%s: node %u died during migration from " 335367ae1f06SSunil Mushran "%u to %u!\n", dlm->name, dead_node, mle->master, 335467ae1f06SSunil Mushran mle->new_master); 3355c2cd4a44SSunil Mushran 3356c2cd4a44SSunil Mushran /* If we find a lockres associated with the mle, we've 3357c2cd4a44SSunil Mushran * hit this rare case that messes up our lock ordering. 3358c2cd4a44SSunil Mushran * If so, we need to drop the master lock so that we can 3359c2cd4a44SSunil Mushran * take the lockres lock, meaning that we will have to 33606714d8e8SKurt Hackel * restart from the head of list. */ 3361c2cd4a44SSunil Mushran res = dlm_reset_mleres_owner(dlm, mle); 3362c2cd4a44SSunil Mushran if (res) 33636714d8e8SKurt Hackel /* restart */ 33646714d8e8SKurt Hackel goto top; 33656714d8e8SKurt Hackel 336667ae1f06SSunil Mushran /* This may be the last reference */ 33676714d8e8SKurt Hackel __dlm_put_mle(mle); 33686714d8e8SKurt Hackel } 33692ed6c750SSunil Mushran } 33706714d8e8SKurt Hackel spin_unlock(&dlm->master_lock); 33716714d8e8SKurt Hackel } 33726714d8e8SKurt Hackel 33736714d8e8SKurt Hackel int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 33746714d8e8SKurt Hackel u8 old_master) 33756714d8e8SKurt Hackel { 33766714d8e8SKurt Hackel struct dlm_node_iter iter; 33776714d8e8SKurt Hackel int ret = 0; 33786714d8e8SKurt Hackel 33796714d8e8SKurt Hackel spin_lock(&dlm->spinlock); 33806714d8e8SKurt Hackel dlm_node_iter_init(dlm->domain_map, &iter); 33816714d8e8SKurt Hackel clear_bit(old_master, iter.node_map); 33826714d8e8SKurt Hackel clear_bit(dlm->node_num, iter.node_map); 33836714d8e8SKurt Hackel spin_unlock(&dlm->spinlock); 33846714d8e8SKurt Hackel 3385ba2bf218SKurt Hackel /* ownership of the lockres is changing. 
	 * account for the
	 * mastery reference here since old_master will briefly have
	 * a reference after the migration completes */
	spin_lock(&res->spinlock);
	dlm_lockres_set_refmap_bit(old_master, res);
	spin_unlock(&res->spinlock);

	mlog(0, "now time to do a migrate request to other nodes\n");
	ret = dlm_do_migrate_request(dlm, res, old_master,
				     dlm->node_num, &iter);
	if (ret < 0) {
		mlog_errno(ret);
		goto leave;
	}

	mlog(0, "doing assert master of %.*s to all except the original node\n",
	     res->lockname.len, res->lockname.name);
	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	ret = dlm_do_assert_master(dlm, res, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		/* no longer need to retry. all living nodes contacted.
		 */
		mlog_errno(ret);
		ret = 0;
	}

	/* now assert master only to the old master */
	memset(iter.node_map, 0, sizeof(iter.node_map));
	set_bit(old_master, iter.node_map);
	mlog(0, "doing assert master of %.*s back to %u\n",
	     res->lockname.len, res->lockname.name, old_master);
	ret = dlm_do_assert_master(dlm, res, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		mlog(0, "assert master to original master failed "
		     "with %d.\n", ret);
		/* the only nonzero status here would be because of
		 * a dead original node. we're done. */
		ret = 0;
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, dlm->node_num);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	/* re-dirty it on the new master */
	dlm_kick_thread(dlm, res);
	wake_up(&res->wq);
leave:
	return ret;
}

/*
 * LOCKRES AST REFCOUNT
 * this is integral to migration
 */

/* for future intent to call an ast, reserve one ahead of time.
 * this should be called only after waiting on the lockres
 * with dlm_wait_on_lockres, and while still holding the
 * spinlock after the call. */
void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	/* Reserving an ast while a migration is in flight is a fatal
	 * inconsistency; dump the lockres before the BUG_ON fires. */
	if (res->state & DLM_LOCK_RES_MIGRATING) {
		__dlm_print_one_lock_resource(res);
	}
	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);

	atomic_inc(&res->asts_reserved);
}

/*
 * used to drop the reserved ast, either because it went unused,
 * or because the ast/bast was actually called.
 *
 * also, if there is a pending migration on this lockres,
 * and this was the last pending ast on the lockres,
 * atomically set the MIGRATING flag before we drop the lock.
 * this is how we ensure that migration can proceed with no
 * asts in progress. note that it is ok if the state of the
 * queues is such that a lock should be granted in the future
 * or that a bast should be fired, because the new master will
 * shuffle the lists on this lockres as soon as it is migrated.
 */
void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res)
{
	/* atomic_dec_and_lock() takes res->spinlock only when the count
	 * drops to zero; otherwise there are still reserved asts and
	 * nothing more to do here. */
	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
		return;

	if (!res->migration_pending) {
		spin_unlock(&res->spinlock);
		return;
	}

	/* last reserved ast with a migration pending: atomically flip
	 * the lockres to MIGRATING, then wake lockres and migration
	 * waiters */
	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
	res->migration_pending = 0;
	res->state |= DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);
	wake_up(&dlm->migration_wq);
}