/*
   drbd_nl.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/blkpg.h>
#include <linux/cpumask.h>
#include "drbd_int.h"
#include "drbd_req.h"
#include "drbd_wrappers.h"
#include <asm/unaligned.h>
#include <linux/drbd_limits.h>
#include <linux/kthread.h>

#include <net/genetlink.h>

/* .doit */
// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);

int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info);

int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);

int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
/* .dumpit */
int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);

#include <linux/drbd_genl_api.h>
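/* Together with the forward declarations above, the next two includes pull in
 * DRBD's generated generic netlink glue: drbd_nla.h provides the nested
 * attribute helpers, and genl_magic_func.h expands, roughly speaking, the
 * <struct>_from_attrs() parsers and the genl_ops table that dispatches to the
 * drbd_adm_* handlers declared above.  (Descriptive summary only; see
 * linux/drbd_genl.h and genl_magic_func.h for the authoritative macros.) */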
#include "drbd_nla.h"
#include <linux/genl_magic_func.h>

/* used blkdev_get_by_path, to claim our meta data device(s) */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";

/* Configuration is strictly serialized, because generic netlink message
 * processing is strictly serialized by the genl_lock().
 * Which means we can use one static global drbd_config_context struct.
 */
static struct drbd_config_context {
	/* assigned from drbd_genlmsghdr */
	unsigned int minor;
	/* assigned from request attributes, if present */
	unsigned int volume;
#define VOLUME_UNSPECIFIED	(-1U)
	/* pointer into the request skb,
	 * limited lifetime! */
	char *resource_name;
	struct nlattr *my_addr;
	struct nlattr *peer_addr;

	/* reply buffer */
	struct sk_buff *reply_skb;
	/* pointer into reply buffer */
	struct drbd_genlmsghdr *reply_dh;
	/* resolved from attributes, if possible */
	struct drbd_conf *mdev;
	struct drbd_tconn *tconn;
} adm_ctx;

static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
{
	genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
	if (genlmsg_reply(skb, info))
		printk(KERN_ERR "drbd: error sending genl reply\n");
}

/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
 * reason it could fail was no space in skb, and there are 4k available. */
int drbd_msg_put_info(const char *info)
{
	struct sk_buff *skb = adm_ctx.reply_skb;
	struct nlattr *nla;
	int err = -EMSGSIZE;

	if (!info || !info[0])
		return 0;

	nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
	if (!nla)
		return err;

	err = nla_put_string(skb, T_info_text, info);
	if (err) {
		nla_nest_cancel(skb, nla);
		return err;
	} else
		nla_nest_end(skb, nla);
	return 0;
}

/* This would be a good candidate for a "pre_doit" hook,
 * and per-family private info->pointers.
 * But we need to stay compatible with older kernels.
 * If it returns successfully, adm_ctx members are valid.
 */
#define DRBD_ADM_NEED_MINOR	1
#define DRBD_ADM_NEED_RESOURCE	2
#define DRBD_ADM_NEED_CONNECTION 4
static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
	unsigned flags)
{
	struct drbd_genlmsghdr *d_in = info->userhdr;
	const u8 cmd = info->genlhdr->cmd;
	int err;

	memset(&adm_ctx, 0, sizeof(adm_ctx));

	/* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
	if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
		return -EPERM;

	adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!adm_ctx.reply_skb) {
		err = -ENOMEM;
		goto fail;
	}

	adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
					info, &drbd_genl_family, 0, cmd);
	/* a put of a few bytes into a fresh skb of >= 4k will always succeed,
	 * but do the check anyway */
	if (!adm_ctx.reply_dh) {
		err = -ENOMEM;
		goto fail;
	}

	adm_ctx.reply_dh->minor = d_in->minor;
	adm_ctx.reply_dh->ret_code = NO_ERROR;

	adm_ctx.volume = VOLUME_UNSPECIFIED;
	if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
		struct nlattr *nla;
		/* parse and validate only */
		err = drbd_cfg_context_from_attrs(NULL, info);
		if (err)
			goto fail;

		/* It was present, and valid,
		 * copy it over to the reply skb. */
		err = nla_put_nohdr(adm_ctx.reply_skb,
				info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
				info->attrs[DRBD_NLA_CFG_CONTEXT]);
		if (err)
			goto fail;

		/* and assign stuff to the global adm_ctx */
		nla = nested_attr_tb[__nla_type(T_ctx_volume)];
		if (nla)
			adm_ctx.volume = nla_get_u32(nla);
		nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
		if (nla)
			adm_ctx.resource_name = nla_data(nla);
		adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
		adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
		if ((adm_ctx.my_addr &&
		     nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) ||
		    (adm_ctx.peer_addr &&
		     nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) {
			err = -EINVAL;
			goto fail;
		}
	}

	adm_ctx.minor = d_in->minor;
	adm_ctx.mdev = minor_to_mdev(d_in->minor);
	adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name);

	if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) {
		drbd_msg_put_info("unknown minor");
		return ERR_MINOR_INVALID;
	}
	if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) {
		drbd_msg_put_info("unknown resource");
		return ERR_INVALID_REQUEST;
	}

	if (flags & DRBD_ADM_NEED_CONNECTION) {
		if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) {
			drbd_msg_put_info("no resource name expected");
			return ERR_INVALID_REQUEST;
		}
		if (adm_ctx.mdev) {
			drbd_msg_put_info("no minor number expected");
			return ERR_INVALID_REQUEST;
		}
		if (adm_ctx.my_addr && adm_ctx.peer_addr)
			adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
							  nla_len(adm_ctx.my_addr),
							  nla_data(adm_ctx.peer_addr),
							  nla_len(adm_ctx.peer_addr));
		if (!adm_ctx.tconn) {
			drbd_msg_put_info("unknown connection");
			return ERR_INVALID_REQUEST;
		}
	}

	/* some more paranoia, if the request was over-determined */
	if (adm_ctx.mdev && adm_ctx.tconn &&
	    adm_ctx.mdev->tconn != adm_ctx.tconn) {
		pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n",
				adm_ctx.minor, adm_ctx.resource_name,
				adm_ctx.mdev->tconn->name);
		drbd_msg_put_info("minor exists in different resource");
		return ERR_INVALID_REQUEST;
	}
	if (adm_ctx.mdev &&
	    adm_ctx.volume != VOLUME_UNSPECIFIED &&
	    adm_ctx.volume != adm_ctx.mdev->vnr) {
		pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
				adm_ctx.minor, adm_ctx.volume,
				adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name);
		drbd_msg_put_info("minor exists as different volume");
		return ERR_INVALID_REQUEST;
	}

	return NO_ERROR;

fail:
	nlmsg_free(adm_ctx.reply_skb);
	adm_ctx.reply_skb = NULL;
	return err;
}

static int drbd_adm_finish(struct genl_info *info, int retcode)
{
	if (adm_ctx.tconn) {
		kref_put(&adm_ctx.tconn->kref, &conn_destroy);
		adm_ctx.tconn = NULL;
	}

	if (!adm_ctx.reply_skb)
		return -ENOMEM;

	adm_ctx.reply_dh->ret_code = retcode;
	drbd_adm_send_reply(adm_ctx.reply_skb, info);
	return 0;
}
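/* The user-space helpers below (drbd_khelper() / conn_khelper()) are invoked
 * as "<usermode_helper> <cmd> <minor-N | resource name>" with a small fixed
 * environment.  setup_khelper_env() fills envp[3] with DRBD_PEER_AF and
 * envp[4] with DRBD_PEER_ADDRESS, derived from the connection's peer address,
 * so that a fence-peer or similar handler script can tell which peer it is
 * dealing with.  (Summary of the code below; the helper binary itself is the
 * usermode_helper module parameter, typically drbdadm.) */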
static void setup_khelper_env(struct drbd_tconn *tconn, char **envp)
{
	char *afs;

	/* FIXME: A future version will not allow this case. */
	if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0)
		return;

	switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) {
	case AF_INET6:
		afs = "ipv6";
		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
			 &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr);
		break;
	case AF_INET:
		afs = "ipv4";
		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
			 &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
		break;
	default:
		afs = "ssocks";
		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
			 &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
	}
	snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
}

int drbd_khelper(struct drbd_conf *mdev, char *cmd)
{
	char *envp[] = { "HOME=/",
			"TERM=linux",
			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
			 (char[20]) { }, /* address family */
			 (char[60]) { }, /* address */
			NULL };
	char mb[12];
	char *argv[] = {usermode_helper, cmd, mb, NULL };
	struct drbd_tconn *tconn = mdev->tconn;
	struct sib_info sib;
	int ret;

	if (current == tconn->worker.task)
		set_bit(CALLBACK_PENDING, &tconn->flags);

	snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
	setup_khelper_env(tconn, envp);

	/* The helper may take some time.
	 * write out any unsynced meta data changes now */
	drbd_md_sync(mdev);

	dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
	sib.sib_reason = SIB_HELPER_PRE;
	sib.helper_name = cmd;
	drbd_bcast_event(mdev, &sib);
	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
	if (ret)
		dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
				usermode_helper, cmd, mb,
				(ret >> 8) & 0xff, ret);
	else
		dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
				usermode_helper, cmd, mb,
				(ret >> 8) & 0xff, ret);
	sib.sib_reason = SIB_HELPER_POST;
	sib.helper_exit_code = ret;
	drbd_bcast_event(mdev, &sib);

	if (current == tconn->worker.task)
		clear_bit(CALLBACK_PENDING, &tconn->flags);

	if (ret < 0) /* Ignore any ERRNOs we got. */
		ret = 0;

	return ret;
}

int conn_khelper(struct drbd_tconn *tconn, char *cmd)
{
	char *envp[] = { "HOME=/",
			"TERM=linux",
			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
			 (char[20]) { }, /* address family */
			 (char[60]) { }, /* address */
			NULL };
	char *argv[] = {usermode_helper, cmd, tconn->name, NULL };
	int ret;

	setup_khelper_env(tconn, envp);
	conn_md_sync(tconn);

	conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name);
	/* TODO: conn_bcast_event() ?? */

	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
	if (ret)
		conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
			  usermode_helper, cmd, tconn->name,
			  (ret >> 8) & 0xff, ret);
	else
		conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
			  usermode_helper, cmd, tconn->name,
			  (ret >> 8) & 0xff, ret);
	/* TODO: conn_bcast_event() ?? */

	if (ret < 0) /* Ignore any ERRNOs we got. */
		ret = 0;

	return ret;
}

static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
{
	enum drbd_fencing_p fp = FP_NOT_AVAIL;
	struct drbd_conf *mdev;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		if (get_ldev_if_state(mdev, D_CONSISTENT)) {
			fp = max_t(enum drbd_fencing_p, fp,
				   rcu_dereference(mdev->ldev->disk_conf)->fencing);
			put_ldev(mdev);
		}
	}
	rcu_read_unlock();

	return fp;
}

bool conn_try_outdate_peer(struct drbd_tconn *tconn)
{
	unsigned int connect_cnt;
	union drbd_state mask = { };
	union drbd_state val = { };
	enum drbd_fencing_p fp;
	char *ex_to_string;
	int r;

	if (tconn->cstate >= C_WF_REPORT_PARAMS) {
		conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n");
		return false;
	}

	spin_lock_irq(&tconn->req_lock);
	connect_cnt = tconn->connect_cnt;
	spin_unlock_irq(&tconn->req_lock);

	fp = highest_fencing_policy(tconn);
	switch (fp) {
	case FP_NOT_AVAIL:
		conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n");
		goto out;
	case FP_DONT_CARE:
		return true;
	default: ;
	}

	r = conn_khelper(tconn, "fence-peer");

	switch ((r>>8) & 0xff) {
	case 3: /* peer is inconsistent */
		ex_to_string = "peer is inconsistent or worse";
		mask.pdsk = D_MASK;
		val.pdsk = D_INCONSISTENT;
		break;
	case 4: /* peer got outdated, or was already outdated */
		ex_to_string = "peer was fenced";
		mask.pdsk = D_MASK;
		val.pdsk = D_OUTDATED;
		break;
	case 5: /* peer was down */
		if (conn_highest_disk(tconn) == D_UP_TO_DATE) {
			/* we will(have) create(d) a new UUID anyways... */
			ex_to_string = "peer is unreachable, assumed to be dead";
			mask.pdsk = D_MASK;
			val.pdsk = D_OUTDATED;
		} else {
			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
		}
		break;
	case 6: /* Peer is primary, voluntarily outdate myself.
		 * This is useful when an unconnected R_SECONDARY is asked to
		 * become R_PRIMARY, but finds the other peer being active. */
		ex_to_string = "peer is active";
		conn_warn(tconn, "Peer is primary, outdating myself.\n");
		mask.disk = D_MASK;
		val.disk = D_OUTDATED;
		break;
	case 7:
		if (fp != FP_STONITH)
			conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n");
		ex_to_string = "peer was stonithed";
		mask.pdsk = D_MASK;
		val.pdsk = D_OUTDATED;
		break;
	default:
		/* The script is broken ... */
		conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
		return false; /* Eventually leave IO frozen */
	}

	conn_info(tconn, "fence-peer helper returned %d (%s)\n",
		  (r>>8) & 0xff, ex_to_string);

 out:

	/* Not using
	   conn_request_state(tconn, mask, val, CS_VERBOSE);
	   here, because we might have been able to re-establish the
	   connection in the meantime. */
	spin_lock_irq(&tconn->req_lock);
	if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) {
		if (tconn->connect_cnt != connect_cnt)
			/* In case the connection was established and dropped
			   while the fence-peer handler was running, ignore it */
			conn_info(tconn, "Ignoring fence-peer exit code\n");
		else
			_conn_request_state(tconn, mask, val, CS_VERBOSE);
	}
	spin_unlock_irq(&tconn->req_lock);

	return conn_highest_pdsk(tconn) <= D_OUTDATED;
}

static int _try_outdate_peer_async(void *data)
{
	struct drbd_tconn *tconn = (struct drbd_tconn *)data;

	conn_try_outdate_peer(tconn);

	kref_put(&tconn->kref, &conn_destroy);
	return 0;
}

void conn_try_outdate_peer_async(struct drbd_tconn *tconn)
{
	struct task_struct *opa;

	kref_get(&tconn->kref);
	opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h");
	if (IS_ERR(opa)) {
		conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n");
		kref_put(&tconn->kref, &conn_destroy);
	}
}

enum drbd_state_rv
drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
{
	const int max_tries = 4;
	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
	struct net_conf *nc;
	int try = 0;
	int forced = 0;
	union drbd_state mask, val;

	if (new_role == R_PRIMARY)
		request_ping(mdev->tconn); /* Detect a dead peer ASAP */

	mutex_lock(mdev->state_mutex);

	mask.i = 0; mask.role = R_MASK;
	val.i  = 0; val.role  = new_role;

	while (try++ < max_tries) {
		rv = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);

		/* in case we first succeeded to outdate,
		 * but now suddenly could establish a connection */
		if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
			val.pdsk = 0;
			mask.pdsk = 0;
			continue;
		}

		if (rv == SS_NO_UP_TO_DATE_DISK && force &&
		    (mdev->state.disk < D_UP_TO_DATE &&
		     mdev->state.disk >= D_INCONSISTENT)) {
			mask.disk = D_MASK;
			val.disk  = D_UP_TO_DATE;
			forced = 1;
			continue;
		}

		if (rv == SS_NO_UP_TO_DATE_DISK &&
		    mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
			D_ASSERT(mdev->state.pdsk == D_UNKNOWN);

			if (conn_try_outdate_peer(mdev->tconn)) {
				val.disk = D_UP_TO_DATE;
				mask.disk = D_MASK;
			}
			continue;
		}

		if (rv == SS_NOTHING_TO_DO)
			goto out;
		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
			if (!conn_try_outdate_peer(mdev->tconn) && force) {
				dev_warn(DEV, "Forced into split brain situation!\n");
				mask.pdsk = D_MASK;
				val.pdsk  = D_OUTDATED;

			}
			continue;
		}
		if (rv == SS_TWO_PRIMARIES) {
			/* Maybe the peer is detected as dead very soon...
			   retry at most once more in this case. */
			int timeo;
			rcu_read_lock();
			nc = rcu_dereference(mdev->tconn->net_conf);
			timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
			rcu_read_unlock();
			schedule_timeout_interruptible(timeo);
			if (try < max_tries)
				try = max_tries - 1;
			continue;
		}
		if (rv < SS_SUCCESS) {
			rv = _drbd_request_state(mdev, mask, val,
						CS_VERBOSE + CS_WAIT_COMPLETE);
			if (rv < SS_SUCCESS)
				goto out;
		}
		break;
	}

	if (rv < SS_SUCCESS)
		goto out;

	if (forced)
		dev_warn(DEV, "Forced to consider local data as UpToDate!\n");

	/* Wait until nothing is on the fly :) */
	wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);

	/* FIXME also wait for all pending P_BARRIER_ACK? */

	if (new_role == R_SECONDARY) {
		set_disk_ro(mdev->vdisk, true);
		if (get_ldev(mdev)) {
			mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
			put_ldev(mdev);
		}
	} else {
		mutex_lock(&mdev->tconn->conf_update);
		nc = mdev->tconn->net_conf;
		if (nc)
			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
		mutex_unlock(&mdev->tconn->conf_update);

		set_disk_ro(mdev->vdisk, false);
		if (get_ldev(mdev)) {
			if (((mdev->state.conn < C_CONNECTED ||
			      mdev->state.pdsk <= D_FAILED)
			     && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
				drbd_uuid_new_current(mdev);

			mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
			put_ldev(mdev);
		}
	}

	/* writeout of activity log covered areas of the bitmap
	 * to stable storage done in after state change already */

	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
		/* if this was forced, we should consider sync */
		if (forced)
			drbd_send_uuids(mdev);
		drbd_send_current_state(mdev);
	}

	drbd_md_sync(mdev);

	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
out:
	mutex_unlock(mdev->state_mutex);
	return rv;
}

static const char *from_attrs_err_to_txt(int err)
{
	return	err == -ENOMSG ? "required attribute missing" :
		err == -EOPNOTSUPP ? "unknown mandatory attribute" :
		err == -EEXIST ? "can not change invariant setting" :
		"invalid attribute value";
}

int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
{
	struct set_role_parms parms;
	int err;
	enum drbd_ret_code retcode;

	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
	if (!adm_ctx.reply_skb)
		return retcode;
	if (retcode != NO_ERROR)
		goto out;

	memset(&parms, 0, sizeof(parms));
	if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
		err = set_role_parms_from_attrs(&parms, info);
		if (err) {
			retcode = ERR_MANDATORY_TAG;
			drbd_msg_put_info(from_attrs_err_to_txt(err));
			goto out;
		}
	}

	if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
		retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate);
	else
		retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0);
out:
	drbd_adm_finish(info, retcode);
	return 0;
}
/* Initializes the md.*_offset members, so we are able to find
 * the on disk meta data.
 *
 * We currently have two possible layouts:
 * external:
 *   |----------- md_size_sect ------------------|
 *   [ 4k superblock ][ activity log ][  Bitmap  ]
 *   | al_offset == 8 |
 *   | bm_offset = al_offset + X      |
 *  ==> bitmap sectors = md_size_sect - bm_offset
 *
 * internal:
 *            |----------- md_size_sect ------------------|
 * [data.....][  Bitmap  ][ activity log ][ 4k superblock ]
 *                        | al_offset < 0 |
 *            | bm_offset = al_offset - Y |
 *  ==> bitmap sectors = Y = al_offset - bm_offset
 *
 * Activity log size used to be fixed 32kB,
 * but is about to become configurable.
 */
static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
				       struct drbd_backing_dev *bdev)
{
	sector_t md_size_sect = 0;
	unsigned int al_size_sect = bdev->md.al_size_4k * 8;

	bdev->md.md_offset = drbd_md_ss(bdev);

	switch (bdev->md.meta_dev_idx) {
	default:
		/* v07 style fixed size indexed meta data */
		bdev->md.md_size_sect = MD_128MB_SECT;
		bdev->md.al_offset = MD_4kB_SECT;
		bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
		break;
	case DRBD_MD_INDEX_FLEX_EXT:
		/* just occupy the full device; unit: sectors */
		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
		bdev->md.al_offset = MD_4kB_SECT;
		bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
		break;
	case DRBD_MD_INDEX_INTERNAL:
	case DRBD_MD_INDEX_FLEX_INT:
		/* al size is still fixed */
		bdev->md.al_offset = -al_size_sect;
		/* we need (slightly less than) ~ this much bitmap sectors: */
		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
		md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
		md_size_sect = ALIGN(md_size_sect, 8);

		/* plus the "drbd meta data super block",
		 * and the activity log; */
		md_size_sect += MD_4kB_SECT + al_size_sect;

		bdev->md.md_size_sect = md_size_sect;
		/* bitmap offset is adjusted by 'super' block size */
		bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
		break;
	}
}

/* input size is expected to be in KB */
char *ppsize(char *buf, unsigned long long size)
{
	/* Needs 9 bytes at max including trailing NUL:
	 * -1ULL ==> "16384 EB" */
	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	int base = 0;
	while (size >= 10000 && base < sizeof(units)-1) {
		/* shift + round */
		size = (size >> 10) + !!(size & (1<<9));
		base++;
	}
	sprintf(buf, "%u %cB", (unsigned)size, units[base]);

	return buf;
}
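/* For example (values illustrative only): an input of 1048576 KB (1 GiB) is
 * repeatedly scaled down by 1024 with rounding until it drops below 10000,
 * so ppsize() renders it as "1024 MB"; 3221225472 KB (3 TiB) becomes
 * "3072 GB".  That is why 9 bytes of buffer always suffice. */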
/* there is still a theoretical deadlock when called from receiver
 * on a D_INCONSISTENT R_PRIMARY:
 * remote READ does inc_ap_bio, receiver would need to receive answer
 * packet from remote to dec_ap_bio again.
 * receiver receive_sizes(), comes here,
 * waits for ap_bio_cnt == 0. -> deadlock.
 * but this cannot happen, actually, because:
 * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
 * (not connected, or bad/no disk on peer):
 * see drbd_fail_request_early, ap_bio_cnt is zero.
 * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
 * peer may not initiate a resize.
 */
/* Note these are not to be confused with
 * drbd_adm_suspend_io/drbd_adm_resume_io,
 * which are (sub) state changes triggered by admin (drbdsetup),
 * and can be long lived.
 * This changes an mdev->flag, is triggered by drbd internals,
 * and should be short-lived. */
void drbd_suspend_io(struct drbd_conf *mdev)
{
	set_bit(SUSPEND_IO, &mdev->flags);
	if (drbd_suspended(mdev))
		return;
	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
}

void drbd_resume_io(struct drbd_conf *mdev)
{
	clear_bit(SUSPEND_IO, &mdev->flags);
	wake_up(&mdev->misc_wait);
}

/**
 * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
 * @mdev:	DRBD device.
 *
 * Returns 0 on success, negative return values indicate errors.
 * You should call drbd_md_sync() after calling this function.
 */
enum determine_dev_size
drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
{
	sector_t prev_first_sect, prev_size; /* previous meta location */
	sector_t la_size_sect, u_size;
	struct drbd_md *md = &mdev->ldev->md;
	u32 prev_al_stripe_size_4k;
	u32 prev_al_stripes;
	sector_t size;
	char ppb[10];
	void *buffer;

	int md_moved, la_size_changed;
	enum determine_dev_size rv = DS_UNCHANGED;

	/* race:
	 * application request passes inc_ap_bio,
	 * but then cannot get an AL-reference.
	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
	 *
	 * to avoid that:
	 * Suspend IO right here.
	 * still lock the act_log to not trigger ASSERTs there.
	 */
	drbd_suspend_io(mdev);
	buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */
	if (!buffer) {
		drbd_resume_io(mdev);
		return DS_ERROR;
	}

	/* no wait necessary anymore, actually we could assert that */
	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));

	prev_first_sect = drbd_md_first_sector(mdev->ldev);
	prev_size = mdev->ldev->md.md_size_sect;
	la_size_sect = mdev->ldev->md.la_size_sect;

	if (rs) {
		/* rs is non NULL if we should change the AL layout only */

		prev_al_stripes = md->al_stripes;
		prev_al_stripe_size_4k = md->al_stripe_size_4k;

		md->al_stripes = rs->al_stripes;
		md->al_stripe_size_4k = rs->al_stripe_size / 4;
		md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
	}

	drbd_md_set_sector_offsets(mdev, mdev->ldev);

	rcu_read_lock();
	u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
	rcu_read_unlock();
	size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);

	if (size < la_size_sect) {
		if (rs && u_size == 0) {
			/* Remove "rs &&" later. This check should always be active, but
			   right now the receiver expects the permissive behavior */
			dev_warn(DEV, "Implicit shrink not allowed. "
				 "Use --size=%llus for explicit shrink.\n",
				 (unsigned long long)size);
			rv = DS_ERROR_SHRINK;
		}
		if (u_size > size)
			rv = DS_ERROR_SPACE_MD;
		if (rv != DS_UNCHANGED)
			goto err_out;
	}

	if (drbd_get_capacity(mdev->this_bdev) != size ||
	    drbd_bm_capacity(mdev) != size) {
		int err;
		err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
		if (unlikely(err)) {
			/* currently there is only one error: ENOMEM! */
			size = drbd_bm_capacity(mdev)>>1;
			if (size == 0) {
				dev_err(DEV, "OUT OF MEMORY! "
					"Could not allocate bitmap!\n");
			} else {
				dev_err(DEV, "BM resizing failed. "
					"Leaving size unchanged at size = %lu KB\n",
					(unsigned long)size);
			}
			rv = DS_ERROR;
		}
		/* racy, see comments above. */
		drbd_set_my_capacity(mdev, size);
		mdev->ldev->md.la_size_sect = size;
		dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
			 (unsigned long long)size>>1);
	}
	if (rv <= DS_ERROR)
		goto err_out;

	la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);

	md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
		|| prev_size	   != mdev->ldev->md.md_size_sect;

	if (la_size_changed || md_moved || rs) {
		u32 prev_flags;

		drbd_al_shrink(mdev); /* All extents inactive. */

		prev_flags = md->flags;
		md->flags &= ~MDF_PRIMARY_IND;
		drbd_md_write(mdev, buffer);

		dev_info(DEV, "Writing the whole bitmap, %s\n",
			 la_size_changed && md_moved ? "size changed and md moved" :
			 la_size_changed ? "size changed" : "md moved");
		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
		drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
			       "size changed", BM_LOCKED_MASK);
		drbd_initialize_al(mdev, buffer);

		md->flags = prev_flags;
		drbd_md_write(mdev, buffer);

		if (rs)
			dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
				 md->al_stripes, md->al_stripe_size_4k * 4);
	}

	if (size > la_size_sect)
		rv = DS_GREW;
	if (size < la_size_sect)
		rv = DS_SHRUNK;

	if (0) {
	err_out:
		if (rs) {
			md->al_stripes = prev_al_stripes;
			md->al_stripe_size_4k = prev_al_stripe_size_4k;
			md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;

			drbd_md_set_sector_offsets(mdev, mdev->ldev);
		}
	}
	lc_unlock(mdev->act_log);
	wake_up(&mdev->al_wait);
	drbd_md_put_buffer(mdev);
	drbd_resume_io(mdev);

	return rv;
}

sector_t
drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
		  sector_t u_size, int assume_peer_has_space)
{
	sector_t p_size = mdev->p_size;   /* partner's disk size. */
	sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
	sector_t m_size; /* my size */
	sector_t size = 0;

	m_size = drbd_get_max_capacity(bdev);

	if (mdev->state.conn < C_CONNECTED && assume_peer_has_space) {
		dev_warn(DEV, "Resize while not connected was forced by the user!\n");
		p_size = m_size;
	}

	if (p_size && m_size) {
		size = min_t(sector_t, p_size, m_size);
	} else {
		if (la_size_sect) {
			size = la_size_sect;
			if (m_size && m_size < size)
				size = m_size;
			if (p_size && p_size < size)
				size = p_size;
		} else {
			if (m_size)
				size = m_size;
			if (p_size)
				size = p_size;
		}
	}

	if (size == 0)
		dev_err(DEV, "Both nodes diskless!\n");

	if (u_size) {
		if (u_size > size)
			dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
				(unsigned long)u_size>>1, (unsigned long)size>>1);
		else
			size = u_size;
	}

	return size;
}

/**
 * drbd_check_al_size() - Ensures that the AL is of the right size
 * @mdev:	DRBD device.
 *
 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
 * failed, and 0 on success. You should call drbd_md_sync() after you called
 * this function.
 */
static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc)
{
	struct lru_cache *n, *t;
	struct lc_element *e;
	unsigned int in_use;
	int i;

	if (mdev->act_log &&
	    mdev->act_log->nr_elements == dc->al_extents)
		return 0;

	in_use = 0;
	t = mdev->act_log;
	n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
		dc->al_extents, sizeof(struct lc_element), 0);

	if (n == NULL) {
		dev_err(DEV, "Cannot allocate act_log lru!\n");
		return -ENOMEM;
	}
	spin_lock_irq(&mdev->al_lock);
	if (t) {
		for (i = 0; i < t->nr_elements; i++) {
			e = lc_element_by_index(t, i);
			if (e->refcnt)
				dev_err(DEV, "refcnt(%d)==%d\n",
				    e->lc_number, e->refcnt);
			in_use += e->refcnt;
		}
	}
	if (!in_use)
		mdev->act_log = n;
	spin_unlock_irq(&mdev->al_lock);
	if (in_use) {
		dev_err(DEV, "Activity log still in use!\n");
		lc_destroy(n);
		return -EBUSY;
	} else {
		if (t)
			lc_destroy(t);
	}
	drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elements */
	return 0;
}

static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
{
	struct request_queue * const q = mdev->rq_queue;
	unsigned int max_hw_sectors = max_bio_size >> 9;
	unsigned int max_segments = 0;

	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;

		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
		rcu_read_lock();
		max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs;
		rcu_read_unlock();
		put_ldev(mdev);
	}

	blk_queue_logical_block_size(q, 512);
	blk_queue_max_hw_sectors(q, max_hw_sectors);
	/* This is the workaround for "bio would need to, but cannot, be split" */
	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);

	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;

		blk_queue_stack_limits(q, b);

		if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
			dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
				 q->backing_dev_info.ra_pages,
				 b->backing_dev_info.ra_pages);
			q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
		}
		put_ldev(mdev);
	}
}

void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
{
	unsigned int now, new, local, peer;

	now = queue_max_hw_sectors(mdev->rq_queue) << 9;
	local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */
	peer = mdev->peer_max_bio_size; /* Eventually last known value, from meta data */

	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
		mdev->local_max_bio_size = local;
		put_ldev(mdev);
	}
	local = min(local, DRBD_MAX_BIO_SIZE);

	/* We may ignore peer limits if the peer is modern enough.
	   From 8.3.8 onwards the peer can use multiple BIOs
	   for a single peer_request. */
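	/* In short (as implemented below): peers older than protocol version 94
	 * are limited to what they announced, at most the 32 KiB H80 packet
	 * limit; version 94 gets exactly DRBD_MAX_SIZE_H80_PACKET; versions
	 * 95..99 get DRBD_MAX_BIO_SIZE_P95; version 100 or newer the full
	 * DRBD_MAX_BIO_SIZE. */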
	if (mdev->state.conn >= C_CONNECTED) {
		if (mdev->tconn->agreed_pro_version < 94)
			peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
		else if (mdev->tconn->agreed_pro_version == 94)
			peer = DRBD_MAX_SIZE_H80_PACKET;
		else if (mdev->tconn->agreed_pro_version < 100)
			peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
		else
			peer = DRBD_MAX_BIO_SIZE;
	}

	new = min(local, peer);

	if (mdev->state.role == R_PRIMARY && new < now)
		dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now);

	if (new != now)
		dev_info(DEV, "max BIO size = %u\n", new);

	drbd_setup_queue_param(mdev, new);
}

/* Starts the worker thread */
static void conn_reconfig_start(struct drbd_tconn *tconn)
{
	drbd_thread_start(&tconn->worker);
	conn_flush_workqueue(tconn);
}

/* if still unconfigured, stops worker again. */
static void conn_reconfig_done(struct drbd_tconn *tconn)
{
	bool stop_threads;
	spin_lock_irq(&tconn->req_lock);
	stop_threads = conn_all_vols_unconf(tconn) &&
		tconn->cstate == C_STANDALONE;
	spin_unlock_irq(&tconn->req_lock);
	if (stop_threads) {
		/* asender is implicitly stopped by receiver
		 * in conn_disconnect() */
		drbd_thread_stop(&tconn->receiver);
		drbd_thread_stop(&tconn->worker);
	}
}

/* Make sure IO is suspended before calling this function(). */
static void drbd_suspend_al(struct drbd_conf *mdev)
{
	int s = 0;

	if (!lc_try_lock(mdev->act_log)) {
		dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
		return;
	}

	drbd_al_shrink(mdev);
	spin_lock_irq(&mdev->tconn->req_lock);
	if (mdev->state.conn < C_CONNECTED)
		s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
	spin_unlock_irq(&mdev->tconn->req_lock);
	lc_unlock(mdev->act_log);

	if (s)
		dev_info(DEV, "Suspended AL updates\n");
}


static bool should_set_defaults(struct genl_info *info)
{
	unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
	return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
}

static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
{
	/* This is limited by 16 bit "slot" numbers,
	 * and by available on-disk context storage.
	 *
	 * Also (u16)~0 is special (denotes a "free" extent).
	 *
	 * One transaction occupies one 4kB on-disk block,
	 * we have n such blocks in the on disk ring buffer,
	 * the "current" transaction may fail (n-1),
	 * and there are 919 context slots per transaction.
	 *
	 * 72 transaction blocks amounts to more than 2**16 context slots,
	 * so cap there first.
	 */
	const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
	const unsigned int sufficient_on_disk =
		(max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
		/AL_CONTEXT_PER_TRANSACTION;

	unsigned int al_size_4k = bdev->md.al_size_4k;

	if (al_size_4k > sufficient_on_disk)
		return max_al_nr;

	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
}
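/* Worked example (the numbers follow directly from the comment above):
 * with AL_CONTEXT_PER_TRANSACTION == 919, 72 transaction blocks already hold
 * 72 * 919 = 66168 > 65536 slots, hence the cap at DRBD_AL_EXTENTS_MAX.
 * With the historic fixed 32 kB activity log (al_size_4k == 8), the formula
 * above yields (8 - 1) * 919 = 6433 usable AL extents. */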
1223 */ 1224 const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX; 1225 const unsigned int sufficient_on_disk = 1226 (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1) 1227 /AL_CONTEXT_PER_TRANSACTION; 1228 1229 unsigned int al_size_4k = bdev->md.al_size_4k; 1230 1231 if (al_size_4k > sufficient_on_disk) 1232 return max_al_nr; 1233 1234 return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; 1235 } 1236 1237 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) 1238 { 1239 enum drbd_ret_code retcode; 1240 struct drbd_conf *mdev; 1241 struct disk_conf *new_disk_conf, *old_disk_conf; 1242 struct fifo_buffer *old_plan = NULL, *new_plan = NULL; 1243 int err, fifo_size; 1244 1245 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 1246 if (!adm_ctx.reply_skb) 1247 return retcode; 1248 if (retcode != NO_ERROR) 1249 goto out; 1250 1251 mdev = adm_ctx.mdev; 1252 1253 /* we also need a disk 1254 * to change the options on */ 1255 if (!get_ldev(mdev)) { 1256 retcode = ERR_NO_DISK; 1257 goto out; 1258 } 1259 1260 new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); 1261 if (!new_disk_conf) { 1262 retcode = ERR_NOMEM; 1263 goto fail; 1264 } 1265 1266 mutex_lock(&mdev->tconn->conf_update); 1267 old_disk_conf = mdev->ldev->disk_conf; 1268 *new_disk_conf = *old_disk_conf; 1269 if (should_set_defaults(info)) 1270 set_disk_conf_defaults(new_disk_conf); 1271 1272 err = disk_conf_from_attrs_for_change(new_disk_conf, info); 1273 if (err && err != -ENOMSG) { 1274 retcode = ERR_MANDATORY_TAG; 1275 drbd_msg_put_info(from_attrs_err_to_txt(err)); 1276 } 1277 1278 if (!expect(new_disk_conf->resync_rate >= 1)) 1279 new_disk_conf->resync_rate = 1; 1280 1281 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) 1282 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; 1283 if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev)) 1284 new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev); 1285 1286 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) 1287 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; 1288 1289 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; 1290 if (fifo_size != mdev->rs_plan_s->size) { 1291 new_plan = fifo_alloc(fifo_size); 1292 if (!new_plan) { 1293 dev_err(DEV, "kmalloc of fifo_buffer failed"); 1294 retcode = ERR_NOMEM; 1295 goto fail_unlock; 1296 } 1297 } 1298 1299 drbd_suspend_io(mdev); 1300 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 1301 drbd_al_shrink(mdev); 1302 err = drbd_check_al_size(mdev, new_disk_conf); 1303 lc_unlock(mdev->act_log); 1304 wake_up(&mdev->al_wait); 1305 drbd_resume_io(mdev); 1306 1307 if (err) { 1308 retcode = ERR_NOMEM; 1309 goto fail_unlock; 1310 } 1311 1312 write_lock_irq(&global_state_lock); 1313 retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); 1314 if (retcode == NO_ERROR) { 1315 rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); 1316 drbd_resync_after_changed(mdev); 1317 } 1318 write_unlock_irq(&global_state_lock); 1319 1320 if (retcode != NO_ERROR) 1321 goto fail_unlock; 1322 1323 if (new_plan) { 1324 old_plan = mdev->rs_plan_s; 1325 rcu_assign_pointer(mdev->rs_plan_s, new_plan); 1326 } 1327 1328 mutex_unlock(&mdev->tconn->conf_update); 1329 1330 if (new_disk_conf->al_updates) 1331 mdev->ldev->md.flags &= ~MDF_AL_DISABLED; 1332 else 1333 mdev->ldev->md.flags |= MDF_AL_DISABLED; 1334 1335 if (new_disk_conf->md_flushes) 1336 clear_bit(MD_NO_FUA, &mdev->flags); 1337 else 1338 set_bit(MD_NO_FUA, &mdev->flags); 1339 1340 drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); 
int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
{
	struct drbd_conf *mdev;
	int err;
	enum drbd_ret_code retcode;
	enum determine_dev_size dd;
	sector_t max_possible_sectors;
	sector_t min_md_device_sectors;
	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
	struct disk_conf *new_disk_conf = NULL;
	struct block_device *bdev;
	struct lru_cache *resync_lru = NULL;
	struct fifo_buffer *new_plan = NULL;
	union drbd_state ns, os;
	enum drbd_state_rv rv;
	struct net_conf *nc;

	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
	if (!adm_ctx.reply_skb)
		return retcode;
	if (retcode != NO_ERROR)
		goto finish;

	mdev = adm_ctx.mdev;
	conn_reconfig_start(mdev->tconn);

	/* if you want to reconfigure, please tear down first */
	if (mdev->state.disk > D_DISKLESS) {
		retcode = ERR_DISK_CONFIGURED;
		goto fail;
	}
	/* It may just now have detached because of IO error.  Make sure
	 * drbd_ldev_destroy is done already, we may end up here very fast,
	 * e.g. if someone calls attach from the on-io-error handler,
	 * to realize a "hot spare" feature (not that I'd recommend that) */
	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));

	/* make sure there is no leftover from previous force-detach attempts */
	clear_bit(FORCE_DETACH, &mdev->flags);
	clear_bit(WAS_IO_ERROR, &mdev->flags);
	clear_bit(WAS_READ_ERROR, &mdev->flags);

	/* and no leftover from previously aborted resync or verify, either */
	mdev->rs_total = 0;
	mdev->rs_failed = 0;
	atomic_set(&mdev->rs_pending_cnt, 0);

	/* allocation not in the IO path, drbdsetup context */
	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
	if (!nbc) {
		retcode = ERR_NOMEM;
		goto fail;
	}
	spin_lock_init(&nbc->md.uuid_lock);

	new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
	if (!new_disk_conf) {
		retcode = ERR_NOMEM;
		goto fail;
	}
	nbc->disk_conf = new_disk_conf;

	set_disk_conf_defaults(new_disk_conf);
	err = disk_conf_from_attrs(new_disk_conf, info);
	if (err) {
		retcode = ERR_MANDATORY_TAG;
		drbd_msg_put_info(from_attrs_err_to_txt(err));
		goto fail;
	}

	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;

	new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
	if (!new_plan) {
		retcode = ERR_NOMEM;
		goto fail;
	}

	if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
		retcode = ERR_MD_IDX_INVALID;
		goto fail;
	}

	write_lock_irq(&global_state_lock);
	retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
	write_unlock_irq(&global_state_lock);
	if (retcode != NO_ERROR)
		goto fail;

	rcu_read_lock();
	nc = rcu_dereference(mdev->tconn->net_conf);
	if (nc) {
		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
			rcu_read_unlock();
			retcode = ERR_STONITH_AND_PROT_A;
			goto fail;
		}
	}
	rcu_read_unlock();

	bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
	if (IS_ERR(bdev)) {
		dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
			PTR_ERR(bdev));
		retcode = ERR_OPEN_DISK;
		goto fail;
	}
	nbc->backing_bdev = bdev;

	/*
	 * meta_dev_idx >= 0: external fixed size, possibly multiple
	 * drbd sharing one meta device.  TODO in that case, paranoia
	 * check that [md_bdev, meta_dev_idx] is not yet used by some
	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
	 * should check it for you already; but if you don't, or
	 * someone fooled it, we need to double check here)
	 */
	bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  (new_disk_conf->meta_dev_idx < 0) ?
				  (void *)mdev : (void *)drbd_m_holder);
	if (IS_ERR(bdev)) {
		dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
			PTR_ERR(bdev));
		retcode = ERR_OPEN_MD_DISK;
		goto fail;
	}
	nbc->md_bdev = bdev;

	if ((nbc->backing_bdev == nbc->md_bdev) !=
	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
	     new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
		retcode = ERR_MD_IDX_INVALID;
		goto fail;
	}

	resync_lru = lc_create("resync", drbd_bm_ext_cache,
			1, 61, sizeof(struct bm_extent),
			offsetof(struct bm_extent, lce));
	if (!resync_lru) {
		retcode = ERR_NOMEM;
		goto fail;
	}

	/* Read our meta data super block early.
	 * This also sets other on-disk offsets. */
	retcode = drbd_md_read(mdev, nbc);
	if (retcode != NO_ERROR)
		goto fail;

	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
	if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
		new_disk_conf->al_extents = drbd_al_extents_max(nbc);

	if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
			(unsigned long long) drbd_get_max_capacity(nbc),
			(unsigned long long) new_disk_conf->disk_size);
		retcode = ERR_DISK_TOO_SMALL;
		goto fail;
	}

	if (new_disk_conf->meta_dev_idx < 0) {
		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
		/* at least one MB, otherwise it does not make sense */
		min_md_device_sectors = (2<<10);
	} else {
		max_possible_sectors = DRBD_MAX_SECTORS;
		min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
	}

	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
		retcode = ERR_MD_DISK_TOO_SMALL;
		dev_warn(DEV, "refusing attach: md-device too small, "
			 "at least %llu sectors needed for this meta-disk type\n",
			 (unsigned long long) min_md_device_sectors);
		goto fail;
	}

	/* Make sure the new disk is big enough
	 * (we may currently be R_PRIMARY with no local disk...) */
	if (drbd_get_max_capacity(nbc) <
	    drbd_get_capacity(mdev->this_bdev)) {
		retcode = ERR_DISK_TOO_SMALL;
		goto fail;
	}

	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);

	if (nbc->known_size > max_possible_sectors) {
		dev_warn(DEV, "==> truncating very big lower level device "
			"to currently maximum possible %llu sectors <==\n",
			(unsigned long long) max_possible_sectors);
		if (new_disk_conf->meta_dev_idx >= 0)
			dev_warn(DEV, "==>> using internal or flexible "
				      "meta data may help <<==\n");
	}

	drbd_suspend_io(mdev);
	/* also wait for the last barrier ack. */
	/* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
	 * We need a way to either ignore barrier acks for barriers sent before a device
	 * was attached, or a way to wait for all pending barrier acks to come in.
	 * As barriers are counted per resource,
	 * we'd need to suspend io on all devices of a resource.
	 */
	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev));
	/* and for any other previously queued work */
	drbd_flush_workqueue(mdev);

	rv = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
	retcode = rv;  /* FIXME: Type mismatch. */
	drbd_resume_io(mdev);
	if (rv < SS_SUCCESS)
		goto fail;

	if (!get_ldev_if_state(mdev, D_ATTACHING))
		goto force_diskless;

	if (!mdev->bitmap) {
		if (drbd_bm_init(mdev)) {
			retcode = ERR_NOMEM;
			goto force_diskless_dec;
		}
	}

	if (mdev->state.conn < C_CONNECTED &&
	    mdev->state.role == R_PRIMARY &&
	    (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
		dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
		    (unsigned long long)mdev->ed_uuid);
		retcode = ERR_DATA_NOT_CURRENT;
		goto force_diskless_dec;
	}

	/* Since we are diskless, fix the activity log first... */
	if (drbd_check_al_size(mdev, new_disk_conf)) {
		retcode = ERR_NOMEM;
		goto force_diskless_dec;
	}

	/* Prevent shrinking of consistent devices ! */
	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
	    drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
		dev_warn(DEV, "refusing to truncate a consistent device\n");
		retcode = ERR_DISK_TOO_SMALL;
		goto force_diskless_dec;
	}

	/* Reset the "barriers don't work" bits here, then force meta data to
	 * be written, to ensure we determine if barriers are supported. */
	if (new_disk_conf->md_flushes)
		clear_bit(MD_NO_FUA, &mdev->flags);
	else
		set_bit(MD_NO_FUA, &mdev->flags);

	/* Point of no return reached.
	 * Devices and memory are no longer released by error cleanup below.
	 * now mdev takes over responsibility, and the state engine should
	 * clean it up somewhere. */
	D_ASSERT(mdev->ldev == NULL);
	mdev->ldev = nbc;
	mdev->resync = resync_lru;
	mdev->rs_plan_s = new_plan;
	nbc = NULL;
	resync_lru = NULL;
	new_disk_conf = NULL;
	new_plan = NULL;

	drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);

	if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
		set_bit(CRASHED_PRIMARY, &mdev->flags);
	else
		clear_bit(CRASHED_PRIMARY, &mdev->flags);

	if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
	    !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod))
		set_bit(CRASHED_PRIMARY, &mdev->flags);

	mdev->send_cnt = 0;
	mdev->recv_cnt = 0;
	mdev->read_cnt = 0;
	mdev->writ_cnt = 0;

	drbd_reconsider_max_bio_size(mdev);

	/* If I am currently not R_PRIMARY,
	 * but meta data primary indicator is set,
	 * I just now recover from a hard crash,
	 * and have been R_PRIMARY before that crash.
	 *
	 * Now, if I had no connection before that crash
	 * (have been degraded R_PRIMARY), chances are that
	 * I won't find my peer now either.
	 *
	 * In that case, and _only_ in that case,
	 * we use the degr-wfc-timeout instead of the default,
	 * so we can automatically recover from a crash of a
	 * degraded but active "cluster" after a certain timeout.
	 */
	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
	if (mdev->state.role != R_PRIMARY &&
	     drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
	    !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
		set_bit(USE_DEGR_WFC_T, &mdev->flags);

	dd = drbd_determine_dev_size(mdev, 0, NULL);
	if (dd <= DS_ERROR) {
		retcode = ERR_NOMEM_BITMAP;
		goto force_diskless_dec;
	} else if (dd == DS_GREW)
		set_bit(RESYNC_AFTER_NEG, &mdev->flags);

	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
	    (test_bit(CRASHED_PRIMARY, &mdev->flags) &&
	     drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) {
		dev_info(DEV, "Assuming that all blocks are out of sync "
		     "(aka FullSync)\n");
		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
			"set_n_write from attaching", BM_LOCKED_MASK)) {
			retcode = ERR_IO_MD_DISK;
			goto force_diskless_dec;
		}
	} else {
		if (drbd_bitmap_io(mdev, &drbd_bm_read,
			"read from attaching", BM_LOCKED_MASK)) {
			retcode = ERR_IO_MD_DISK;
			goto force_diskless_dec;
		}
	}

	if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
		drbd_suspend_al(mdev); /* IO is still suspended here... */

	spin_lock_irq(&mdev->tconn->req_lock);
	os = drbd_read_state(mdev);
	ns = os;
	/* If MDF_CONSISTENT is not set go into inconsistent state,
	   otherwise investigate MDF_WasUpToDate...
	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
	   otherwise into D_CONSISTENT state.
	*/
	if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
		if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
			ns.disk = D_CONSISTENT;
		else
			ns.disk = D_OUTDATED;
	} else {
		ns.disk = D_INCONSISTENT;
	}

	if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
		ns.pdsk = D_OUTDATED;

	rcu_read_lock();
	if (ns.disk == D_CONSISTENT &&
	    (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE))
		ns.disk = D_UP_TO_DATE;

	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
	   this point, because drbd_request_state() modifies these
	   flags. */

	if (rcu_dereference(mdev->ldev->disk_conf)->al_updates)
		mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
	else
		mdev->ldev->md.flags |= MDF_AL_DISABLED;

	rcu_read_unlock();

	/* In case we are C_CONNECTED postpone any decision on the new disk
	   state after the negotiation phase. */
	if (mdev->state.conn == C_CONNECTED) {
		mdev->new_state_tmp.i = ns.i;
		ns.i = os.i;
		ns.disk = D_NEGOTIATING;

		/* We expect to receive up-to-date UUIDs soon.
		   To avoid a race in receive_state, free p_uuid while
		   holding req_lock. I.e. atomic with the state change */
		kfree(mdev->p_uuid);
		mdev->p_uuid = NULL;
	}

	rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	spin_unlock_irq(&mdev->tconn->req_lock);

	if (rv < SS_SUCCESS)
		goto force_diskless_dec;

	mod_timer(&mdev->request_timer, jiffies + HZ);

	if (mdev->state.role == R_PRIMARY)
		mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
	else
		mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;

	drbd_md_mark_dirty(mdev);
	drbd_md_sync(mdev);

	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
	put_ldev(mdev);
	conn_reconfig_done(mdev->tconn);
	drbd_adm_finish(info, retcode);
	return 0;

 force_diskless_dec:
	put_ldev(mdev);
 force_diskless:
	drbd_force_state(mdev, NS(disk, D_DISKLESS));
	drbd_md_sync(mdev);
 fail:
	conn_reconfig_done(mdev->tconn);
	if (nbc) {
		if (nbc->backing_bdev)
			blkdev_put(nbc->backing_bdev,
				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
		if (nbc->md_bdev)
			blkdev_put(nbc->md_bdev,
				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
		kfree(nbc);
	}
	kfree(new_disk_conf);
	lc_destroy(resync_lru);
	kfree(new_plan);

 finish:
	drbd_adm_finish(info, retcode);
	return 0;
}

static int adm_detach(struct drbd_conf *mdev, int force)
{
	enum drbd_state_rv retcode;
	int ret;

	if (force) {
		set_bit(FORCE_DETACH, &mdev->flags);
		drbd_force_state(mdev, NS(disk, D_FAILED));
		retcode = SS_SUCCESS;
		goto out;
	}

	drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
	drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
	retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
	drbd_md_put_buffer(mdev);
	/* D_FAILED will transition to DISKLESS. */
	ret = wait_event_interruptible(mdev->misc_wait,
			mdev->state.disk != D_FAILED);
	drbd_resume_io(mdev);
	if ((int)retcode == (int)SS_IS_DISKLESS)
		retcode = SS_NOTHING_TO_DO;
	if (ret)
		retcode = ERR_INTR;
out:
	return retcode;
}

/* Detaching the disk is a process in multiple stages.  First we need to lock
 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
 * internal references as well.
 * Only then we have finally detached. */
int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
{
	enum drbd_ret_code retcode;
	struct detach_parms parms = { };
	int err;

	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
	if (!adm_ctx.reply_skb)
		return retcode;
	if (retcode != NO_ERROR)
		goto out;

	if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
		err = detach_parms_from_attrs(&parms, info);
		if (err) {
			retcode = ERR_MANDATORY_TAG;
			drbd_msg_put_info(from_attrs_err_to_txt(err));
			goto out;
		}
	}

	retcode = adm_detach(adm_ctx.mdev, parms.force_detach);
out:
	drbd_adm_finish(info, retcode);
	return 0;
}

static bool conn_resync_running(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	bool rv = false;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		if (mdev->state.conn == C_SYNC_SOURCE ||
		    mdev->state.conn == C_SYNC_TARGET ||
		    mdev->state.conn == C_PAUSED_SYNC_S ||
		    mdev->state.conn == C_PAUSED_SYNC_T) {
			rv = true;
			break;
		}
	}
	rcu_read_unlock();

	return rv;
}

static bool conn_ov_running(struct drbd_tconn *tconn)
{
	struct drbd_conf *mdev;
	bool rv = false;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		if (mdev->state.conn == C_VERIFY_S ||
		    mdev->state.conn == C_VERIFY_T) {
			rv = true;
			break;
		}
	}
	rcu_read_unlock();

	return rv;
}
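/* Validation helpers for (new) net options, shared by connect and net-opts.
 * Among other checks: once a connection is established with an agreed
 * protocol version < 100, the wire protocol, the two-primaries setting and
 * the integrity algorithm may no longer be changed; two-primaries requires
 * protocol C; FP_STONITH fencing is incompatible with protocol A; and
 * discard-my-data must not be set on a node that is currently primary. */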
First we need to lock 1830 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. 1831 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all 1832 * internal references as well. 1833 * Only then we have finally detached. */ 1834 int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) 1835 { 1836 enum drbd_ret_code retcode; 1837 struct detach_parms parms = { }; 1838 int err; 1839 1840 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 1841 if (!adm_ctx.reply_skb) 1842 return retcode; 1843 if (retcode != NO_ERROR) 1844 goto out; 1845 1846 if (info->attrs[DRBD_NLA_DETACH_PARMS]) { 1847 err = detach_parms_from_attrs(&parms, info); 1848 if (err) { 1849 retcode = ERR_MANDATORY_TAG; 1850 drbd_msg_put_info(from_attrs_err_to_txt(err)); 1851 goto out; 1852 } 1853 } 1854 1855 retcode = adm_detach(adm_ctx.mdev, parms.force_detach); 1856 out: 1857 drbd_adm_finish(info, retcode); 1858 return 0; 1859 } 1860 1861 static bool conn_resync_running(struct drbd_tconn *tconn) 1862 { 1863 struct drbd_conf *mdev; 1864 bool rv = false; 1865 int vnr; 1866 1867 rcu_read_lock(); 1868 idr_for_each_entry(&tconn->volumes, mdev, vnr) { 1869 if (mdev->state.conn == C_SYNC_SOURCE || 1870 mdev->state.conn == C_SYNC_TARGET || 1871 mdev->state.conn == C_PAUSED_SYNC_S || 1872 mdev->state.conn == C_PAUSED_SYNC_T) { 1873 rv = true; 1874 break; 1875 } 1876 } 1877 rcu_read_unlock(); 1878 1879 return rv; 1880 } 1881 1882 static bool conn_ov_running(struct drbd_tconn *tconn) 1883 { 1884 struct drbd_conf *mdev; 1885 bool rv = false; 1886 int vnr; 1887 1888 rcu_read_lock(); 1889 idr_for_each_entry(&tconn->volumes, mdev, vnr) { 1890 if (mdev->state.conn == C_VERIFY_S || 1891 mdev->state.conn == C_VERIFY_T) { 1892 rv = true; 1893 break; 1894 } 1895 } 1896 rcu_read_unlock(); 1897 1898 return rv; 1899 } 1900 1901 static enum drbd_ret_code 1902 _check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf) 1903 { 1904 struct drbd_conf *mdev; 1905 int i; 1906 1907 if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) { 1908 if (new_conf->wire_protocol != old_conf->wire_protocol) 1909 return ERR_NEED_APV_100; 1910 1911 if (new_conf->two_primaries != old_conf->two_primaries) 1912 return ERR_NEED_APV_100; 1913 1914 if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg)) 1915 return ERR_NEED_APV_100; 1916 } 1917 1918 if (!new_conf->two_primaries && 1919 conn_highest_role(tconn) == R_PRIMARY && 1920 conn_highest_peer(tconn) == R_PRIMARY) 1921 return ERR_NEED_ALLOW_TWO_PRI; 1922 1923 if (new_conf->two_primaries && 1924 (new_conf->wire_protocol != DRBD_PROT_C)) 1925 return ERR_NOT_PROTO_C; 1926 1927 idr_for_each_entry(&tconn->volumes, mdev, i) { 1928 if (get_ldev(mdev)) { 1929 enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; 1930 put_ldev(mdev); 1931 if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) 1932 return ERR_STONITH_AND_PROT_A; 1933 } 1934 if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data) 1935 return ERR_DISCARD_IMPOSSIBLE; 1936 } 1937 1938 if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) 1939 return ERR_CONG_NOT_PROTO_A; 1940 1941 return NO_ERROR; 1942 } 1943 1944 static enum drbd_ret_code 1945 check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf) 1946 { 1947 static enum drbd_ret_code rv; 1948 struct drbd_conf *mdev; 1949 int i; 1950 1951 rcu_read_lock(); 1952 rv = _check_net_options(tconn, 
rcu_dereference(tconn->net_conf), new_conf); 1953 rcu_read_unlock(); 1954 1955 /* tconn->volumes protected by genl_lock() here */ 1956 idr_for_each_entry(&tconn->volumes, mdev, i) { 1957 if (!mdev->bitmap) { 1958 if(drbd_bm_init(mdev)) 1959 return ERR_NOMEM; 1960 } 1961 } 1962 1963 return rv; 1964 } 1965 1966 struct crypto { 1967 struct crypto_hash *verify_tfm; 1968 struct crypto_hash *csums_tfm; 1969 struct crypto_hash *cram_hmac_tfm; 1970 struct crypto_hash *integrity_tfm; 1971 }; 1972 1973 static int 1974 alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) 1975 { 1976 if (!tfm_name[0]) 1977 return NO_ERROR; 1978 1979 *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); 1980 if (IS_ERR(*tfm)) { 1981 *tfm = NULL; 1982 return err_alg; 1983 } 1984 1985 return NO_ERROR; 1986 } 1987 1988 static enum drbd_ret_code 1989 alloc_crypto(struct crypto *crypto, struct net_conf *new_conf) 1990 { 1991 char hmac_name[CRYPTO_MAX_ALG_NAME]; 1992 enum drbd_ret_code rv; 1993 1994 rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg, 1995 ERR_CSUMS_ALG); 1996 if (rv != NO_ERROR) 1997 return rv; 1998 rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg, 1999 ERR_VERIFY_ALG); 2000 if (rv != NO_ERROR) 2001 return rv; 2002 rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg, 2003 ERR_INTEGRITY_ALG); 2004 if (rv != NO_ERROR) 2005 return rv; 2006 if (new_conf->cram_hmac_alg[0] != 0) { 2007 snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", 2008 new_conf->cram_hmac_alg); 2009 2010 rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, 2011 ERR_AUTH_ALG); 2012 } 2013 2014 return rv; 2015 } 2016 2017 static void free_crypto(struct crypto *crypto) 2018 { 2019 crypto_free_hash(crypto->cram_hmac_tfm); 2020 crypto_free_hash(crypto->integrity_tfm); 2021 crypto_free_hash(crypto->csums_tfm); 2022 crypto_free_hash(crypto->verify_tfm); 2023 } 2024 2025 int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) 2026 { 2027 enum drbd_ret_code retcode; 2028 struct drbd_tconn *tconn; 2029 struct net_conf *old_conf, *new_conf = NULL; 2030 int err; 2031 int ovr; /* online verify running */ 2032 int rsr; /* re-sync running */ 2033 struct crypto crypto = { }; 2034 2035 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); 2036 if (!adm_ctx.reply_skb) 2037 return retcode; 2038 if (retcode != NO_ERROR) 2039 goto out; 2040 2041 tconn = adm_ctx.tconn; 2042 2043 new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); 2044 if (!new_conf) { 2045 retcode = ERR_NOMEM; 2046 goto out; 2047 } 2048 2049 conn_reconfig_start(tconn); 2050 2051 mutex_lock(&tconn->data.mutex); 2052 mutex_lock(&tconn->conf_update); 2053 old_conf = tconn->net_conf; 2054 2055 if (!old_conf) { 2056 drbd_msg_put_info("net conf missing, try connect"); 2057 retcode = ERR_INVALID_REQUEST; 2058 goto fail; 2059 } 2060 2061 *new_conf = *old_conf; 2062 if (should_set_defaults(info)) 2063 set_net_conf_defaults(new_conf); 2064 2065 err = net_conf_from_attrs_for_change(new_conf, info); 2066 if (err && err != -ENOMSG) { 2067 retcode = ERR_MANDATORY_TAG; 2068 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2069 goto fail; 2070 } 2071 2072 retcode = check_net_options(tconn, new_conf); 2073 if (retcode != NO_ERROR) 2074 goto fail; 2075 2076 /* re-sync running */ 2077 rsr = conn_resync_running(tconn); 2078 if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) { 2079 retcode = ERR_CSUMS_RESYNC_RUNNING; 2080 goto fail; 2081 } 2082 2083 /* online verify running */ 2084 ovr = conn_ov_running(tconn); 2085 if (ovr && 
strcmp(new_conf->verify_alg, old_conf->verify_alg)) { 2086 retcode = ERR_VERIFY_RUNNING; 2087 goto fail; 2088 } 2089 2090 retcode = alloc_crypto(&crypto, new_conf); 2091 if (retcode != NO_ERROR) 2092 goto fail; 2093 2094 rcu_assign_pointer(tconn->net_conf, new_conf); 2095 2096 if (!rsr) { 2097 crypto_free_hash(tconn->csums_tfm); 2098 tconn->csums_tfm = crypto.csums_tfm; 2099 crypto.csums_tfm = NULL; 2100 } 2101 if (!ovr) { 2102 crypto_free_hash(tconn->verify_tfm); 2103 tconn->verify_tfm = crypto.verify_tfm; 2104 crypto.verify_tfm = NULL; 2105 } 2106 2107 crypto_free_hash(tconn->integrity_tfm); 2108 tconn->integrity_tfm = crypto.integrity_tfm; 2109 if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100) 2110 /* Do this without trying to take tconn->data.mutex again. */ 2111 __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE); 2112 2113 crypto_free_hash(tconn->cram_hmac_tfm); 2114 tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; 2115 2116 mutex_unlock(&tconn->conf_update); 2117 mutex_unlock(&tconn->data.mutex); 2118 synchronize_rcu(); 2119 kfree(old_conf); 2120 2121 if (tconn->cstate >= C_WF_REPORT_PARAMS) 2122 drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn))); 2123 2124 goto done; 2125 2126 fail: 2127 mutex_unlock(&tconn->conf_update); 2128 mutex_unlock(&tconn->data.mutex); 2129 free_crypto(&crypto); 2130 kfree(new_conf); 2131 done: 2132 conn_reconfig_done(tconn); 2133 out: 2134 drbd_adm_finish(info, retcode); 2135 return 0; 2136 } 2137 2138 int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) 2139 { 2140 struct drbd_conf *mdev; 2141 struct net_conf *old_conf, *new_conf = NULL; 2142 struct crypto crypto = { }; 2143 struct drbd_tconn *tconn; 2144 enum drbd_ret_code retcode; 2145 int i; 2146 int err; 2147 2148 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); 2149 2150 if (!adm_ctx.reply_skb) 2151 return retcode; 2152 if (retcode != NO_ERROR) 2153 goto out; 2154 if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { 2155 drbd_msg_put_info("connection endpoint(s) missing"); 2156 retcode = ERR_INVALID_REQUEST; 2157 goto out; 2158 } 2159 2160 /* No need for _rcu here. All reconfiguration is 2161 * strictly serialized on genl_lock(). 
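 * The loop below merely rejects a second connection that would reuse an
 * already configured local address (ERR_LOCAL_ADDR) or peer address
 * (ERR_PEER_ADDR).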
We are protected against 2162 * concurrent reconfiguration/addition/deletion */ 2163 list_for_each_entry(tconn, &drbd_tconns, all_tconn) { 2164 if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len && 2165 !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) { 2166 retcode = ERR_LOCAL_ADDR; 2167 goto out; 2168 } 2169 2170 if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len && 2171 !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) { 2172 retcode = ERR_PEER_ADDR; 2173 goto out; 2174 } 2175 } 2176 2177 tconn = adm_ctx.tconn; 2178 conn_reconfig_start(tconn); 2179 2180 if (tconn->cstate > C_STANDALONE) { 2181 retcode = ERR_NET_CONFIGURED; 2182 goto fail; 2183 } 2184 2185 /* allocation not in the IO path, drbdsetup / netlink process context */ 2186 new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL); 2187 if (!new_conf) { 2188 retcode = ERR_NOMEM; 2189 goto fail; 2190 } 2191 2192 set_net_conf_defaults(new_conf); 2193 2194 err = net_conf_from_attrs(new_conf, info); 2195 if (err && err != -ENOMSG) { 2196 retcode = ERR_MANDATORY_TAG; 2197 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2198 goto fail; 2199 } 2200 2201 retcode = check_net_options(tconn, new_conf); 2202 if (retcode != NO_ERROR) 2203 goto fail; 2204 2205 retcode = alloc_crypto(&crypto, new_conf); 2206 if (retcode != NO_ERROR) 2207 goto fail; 2208 2209 ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; 2210 2211 conn_flush_workqueue(tconn); 2212 2213 mutex_lock(&tconn->conf_update); 2214 old_conf = tconn->net_conf; 2215 if (old_conf) { 2216 retcode = ERR_NET_CONFIGURED; 2217 mutex_unlock(&tconn->conf_update); 2218 goto fail; 2219 } 2220 rcu_assign_pointer(tconn->net_conf, new_conf); 2221 2222 conn_free_crypto(tconn); 2223 tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; 2224 tconn->integrity_tfm = crypto.integrity_tfm; 2225 tconn->csums_tfm = crypto.csums_tfm; 2226 tconn->verify_tfm = crypto.verify_tfm; 2227 2228 tconn->my_addr_len = nla_len(adm_ctx.my_addr); 2229 memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len); 2230 tconn->peer_addr_len = nla_len(adm_ctx.peer_addr); 2231 memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len); 2232 2233 mutex_unlock(&tconn->conf_update); 2234 2235 rcu_read_lock(); 2236 idr_for_each_entry(&tconn->volumes, mdev, i) { 2237 mdev->send_cnt = 0; 2238 mdev->recv_cnt = 0; 2239 } 2240 rcu_read_unlock(); 2241 2242 retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); 2243 2244 conn_reconfig_done(tconn); 2245 drbd_adm_finish(info, retcode); 2246 return 0; 2247 2248 fail: 2249 free_crypto(&crypto); 2250 kfree(new_conf); 2251 2252 conn_reconfig_done(tconn); 2253 out: 2254 drbd_adm_finish(info, retcode); 2255 return 0; 2256 } 2257 2258 static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force) 2259 { 2260 enum drbd_state_rv rv; 2261 2262 rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), 2263 force ? CS_HARD : 0); 2264 2265 switch (rv) { 2266 case SS_NOTHING_TO_DO: 2267 break; 2268 case SS_ALREADY_STANDALONE: 2269 return SS_SUCCESS; 2270 case SS_PRIMARY_NOP: 2271 /* Our state checking code wants to see the peer outdated. 
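 * So retry the disconnect, but this time also ask for pdsk = D_OUTDATED in
 * the very same state transition (the NS2() below). If even that fails
 * because the connection was lost in the meantime (SS_OUTDATE_WO_CONN),
 * fall back to a plain disconnect.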
*/ 2272 rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); 2273 2274 if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ 2275 rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE); 2276 2277 break; 2278 case SS_CW_FAILED_BY_PEER: 2279 /* The peer probably wants to see us outdated. */ 2280 rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, 2281 disk, D_OUTDATED), 0); 2282 if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { 2283 rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), 2284 CS_HARD); 2285 } 2286 break; 2287 default:; 2288 /* no special handling necessary */ 2289 } 2290 2291 if (rv >= SS_SUCCESS) { 2292 enum drbd_state_rv rv2; 2293 /* No one else can reconfigure the network while I am here. 2294 * The state handling only uses drbd_thread_stop_nowait(), 2295 * we want to really wait here until the receiver is no more. 2296 */ 2297 drbd_thread_stop(&adm_ctx.tconn->receiver); 2298 2299 /* Race breaker. This additional state change request may be 2300 * necessary, if this was a forced disconnect during a receiver 2301 * restart. We may have "killed" the receiver thread just 2302 * after drbdd_init() returned. Typically, we should be 2303 * C_STANDALONE already, now, and this becomes a no-op. 2304 */ 2305 rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE), 2306 CS_VERBOSE | CS_HARD); 2307 if (rv2 < SS_SUCCESS) 2308 conn_err(tconn, 2309 "unexpected rv2=%d in conn_try_disconnect()\n", 2310 rv2); 2311 } 2312 return rv; 2313 } 2314 2315 int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) 2316 { 2317 struct disconnect_parms parms; 2318 struct drbd_tconn *tconn; 2319 enum drbd_state_rv rv; 2320 enum drbd_ret_code retcode; 2321 int err; 2322 2323 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); 2324 if (!adm_ctx.reply_skb) 2325 return retcode; 2326 if (retcode != NO_ERROR) 2327 goto fail; 2328 2329 tconn = adm_ctx.tconn; 2330 memset(&parms, 0, sizeof(parms)); 2331 if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { 2332 err = disconnect_parms_from_attrs(&parms, info); 2333 if (err) { 2334 retcode = ERR_MANDATORY_TAG; 2335 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2336 goto fail; 2337 } 2338 } 2339 2340 rv = conn_try_disconnect(tconn, parms.force_disconnect); 2341 if (rv < SS_SUCCESS) 2342 retcode = rv; /* FIXME: Type mismatch. 
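 * An enum drbd_state_rv value is stored in an enum drbd_ret_code variable;
 * both travel back to userspace in the same ret_code field, and their value
 * ranges do not overlap, which is presumably why this has been left as is.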
*/ 2343 else 2344 retcode = NO_ERROR; 2345 fail: 2346 drbd_adm_finish(info, retcode); 2347 return 0; 2348 } 2349 2350 void resync_after_online_grow(struct drbd_conf *mdev) 2351 { 2352 int iass; /* I am sync source */ 2353 2354 dev_info(DEV, "Resync of new storage after online grow\n"); 2355 if (mdev->state.role != mdev->state.peer) 2356 iass = (mdev->state.role == R_PRIMARY); 2357 else 2358 iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); 2359 2360 if (iass) 2361 drbd_start_resync(mdev, C_SYNC_SOURCE); 2362 else 2363 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); 2364 } 2365 2366 int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) 2367 { 2368 struct disk_conf *old_disk_conf, *new_disk_conf = NULL; 2369 struct resize_parms rs; 2370 struct drbd_conf *mdev; 2371 enum drbd_ret_code retcode; 2372 enum determine_dev_size dd; 2373 bool change_al_layout = false; 2374 enum dds_flags ddsf; 2375 sector_t u_size; 2376 int err; 2377 2378 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2379 if (!adm_ctx.reply_skb) 2380 return retcode; 2381 if (retcode != NO_ERROR) 2382 goto fail; 2383 2384 mdev = adm_ctx.mdev; 2385 if (!get_ldev(mdev)) { 2386 retcode = ERR_NO_DISK; 2387 goto fail; 2388 } 2389 2390 memset(&rs, 0, sizeof(struct resize_parms)); 2391 rs.al_stripes = mdev->ldev->md.al_stripes; 2392 rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4; 2393 if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { 2394 err = resize_parms_from_attrs(&rs, info); 2395 if (err) { 2396 retcode = ERR_MANDATORY_TAG; 2397 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2398 goto fail_ldev; 2399 } 2400 } 2401 2402 if (mdev->state.conn > C_CONNECTED) { 2403 retcode = ERR_RESIZE_RESYNC; 2404 goto fail_ldev; 2405 } 2406 2407 if (mdev->state.role == R_SECONDARY && 2408 mdev->state.peer == R_SECONDARY) { 2409 retcode = ERR_NO_PRIMARY; 2410 goto fail_ldev; 2411 } 2412 2413 if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { 2414 retcode = ERR_NEED_APV_93; 2415 goto fail_ldev; 2416 } 2417 2418 rcu_read_lock(); 2419 u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; 2420 rcu_read_unlock(); 2421 if (u_size != (sector_t)rs.resize_size) { 2422 new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); 2423 if (!new_disk_conf) { 2424 retcode = ERR_NOMEM; 2425 goto fail_ldev; 2426 } 2427 } 2428 2429 if (mdev->ldev->md.al_stripes != rs.al_stripes || 2430 mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) { 2431 u32 al_size_k = rs.al_stripes * rs.al_stripe_size; 2432 2433 if (al_size_k > (16 * 1024 * 1024)) { 2434 retcode = ERR_MD_LAYOUT_TOO_BIG; 2435 goto fail_ldev; 2436 } 2437 2438 if (al_size_k < MD_32kB_SECT/2) { 2439 retcode = ERR_MD_LAYOUT_TOO_SMALL; 2440 goto fail_ldev; 2441 } 2442 2443 if (mdev->state.conn != C_CONNECTED) { 2444 retcode = ERR_MD_LAYOUT_CONNECTED; 2445 goto fail_ldev; 2446 } 2447 2448 change_al_layout = true; 2449 } 2450 2451 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) 2452 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 2453 2454 if (new_disk_conf) { 2455 mutex_lock(&mdev->tconn->conf_update); 2456 old_disk_conf = mdev->ldev->disk_conf; 2457 *new_disk_conf = *old_disk_conf; 2458 new_disk_conf->disk_size = (sector_t)rs.resize_size; 2459 rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); 2460 mutex_unlock(&mdev->tconn->conf_update); 2461 synchronize_rcu(); 2462 kfree(old_disk_conf); 2463 } 2464 2465 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? 
DDSF_NO_RESYNC : 0);
2466 dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL);
2467 drbd_md_sync(mdev);
2468 put_ldev(mdev);
2469 if (dd == DS_ERROR) {
2470 retcode = ERR_NOMEM_BITMAP;
2471 goto fail;
2472 } else if (dd == DS_ERROR_SPACE_MD) {
2473 retcode = ERR_MD_LAYOUT_NO_FIT;
2474 goto fail;
2475 } else if (dd == DS_ERROR_SHRINK) {
2476 retcode = ERR_IMPLICIT_SHRINK;
2477 goto fail;
2478 }
2479
2480 if (mdev->state.conn == C_CONNECTED) {
2481 if (dd == DS_GREW)
2482 set_bit(RESIZE_PENDING, &mdev->flags);
2483
2484 drbd_send_uuids(mdev);
2485 drbd_send_sizes(mdev, 1, ddsf);
2486 }
2487
2488 fail:
2489 drbd_adm_finish(info, retcode);
2490 return 0;
2491
2492 fail_ldev:
2493 put_ldev(mdev);
2494 goto fail;
2495 }
2496
2497 int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
2498 {
2499 enum drbd_ret_code retcode;
2500 struct drbd_tconn *tconn;
2501 struct res_opts res_opts;
2502 int err;
2503
2504 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
2505 if (!adm_ctx.reply_skb)
2506 return retcode;
2507 if (retcode != NO_ERROR)
2508 goto fail;
2509 tconn = adm_ctx.tconn;
2510
2511 res_opts = tconn->res_opts;
2512 if (should_set_defaults(info))
2513 set_res_opts_defaults(&res_opts);
2514
2515 err = res_opts_from_attrs(&res_opts, info);
2516 if (err && err != -ENOMSG) {
2517 retcode = ERR_MANDATORY_TAG;
2518 drbd_msg_put_info(from_attrs_err_to_txt(err));
2519 goto fail;
2520 }
2521
2522 err = set_resource_options(tconn, &res_opts);
2523 if (err) {
2524 retcode = ERR_INVALID_REQUEST;
2525 if (err == -ENOMEM)
2526 retcode = ERR_NOMEM;
2527 }
2528
2529 fail:
2530 drbd_adm_finish(info, retcode);
2531 return 0;
2532 }
2533
2534 int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2535 {
2536 struct drbd_conf *mdev;
2537 int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2538
2539 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2540 if (!adm_ctx.reply_skb)
2541 return retcode;
2542 if (retcode != NO_ERROR)
2543 goto out;
2544
2545 mdev = adm_ctx.mdev;
2546
2547 /* If there is still bitmap IO pending, probably because of a previous
2548 * resync just being finished, wait for it before requesting a new resync.
2549 * Also wait for its after_state_ch(). */
2550 drbd_suspend_io(mdev);
2551 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2552 drbd_flush_workqueue(mdev);
2553
2554 /* If we happen to be C_STANDALONE R_SECONDARY, just change to
2555 * D_INCONSISTENT, and set all bits in the bitmap. Otherwise,
2556 * try to start a resync handshake as sync target for full sync.
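 * (Editor's note: in the standalone branch below, drbd_bmio_set_n_write()
 * sets every bit in the bitmap, so whenever this node connects again it
 * will become sync target for a full sync.)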
2557 */
2558 if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) {
2559 retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT));
2560 if (retcode >= SS_SUCCESS) {
2561 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
2562 "set_n_write from invalidate", BM_LOCKED_MASK))
2563 retcode = ERR_IO_MD_DISK;
2564 }
2565 } else
2566 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
2567 drbd_resume_io(mdev);
2568
2569 out:
2570 drbd_adm_finish(info, retcode);
2571 return 0;
2572 }
2573
2574 static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
2575 union drbd_state mask, union drbd_state val)
2576 {
2577 enum drbd_ret_code retcode;
2578
2579 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2580 if (!adm_ctx.reply_skb)
2581 return retcode;
2582 if (retcode != NO_ERROR)
2583 goto out;
2584
2585 retcode = drbd_request_state(adm_ctx.mdev, mask, val);
2586 out:
2587 drbd_adm_finish(info, retcode);
2588 return 0;
2589 }
2590
2591 static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
2592 {
2593 int rv;
2594
2595 rv = drbd_bmio_set_n_write(mdev);
2596 drbd_suspend_al(mdev);
2597 return rv;
2598 }
2599
2600 int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2601 {
2602 int retcode; /* drbd_ret_code, drbd_state_rv */
2603 struct drbd_conf *mdev;
2604
2605 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2606 if (!adm_ctx.reply_skb)
2607 return retcode;
2608 if (retcode != NO_ERROR)
2609 goto out;
2610
2611 mdev = adm_ctx.mdev;
2612
2613 /* If there is still bitmap IO pending, probably because of a previous
2614 * resync just being finished, wait for it before requesting a new resync.
2615 * Also wait for its after_state_ch(). */
2616 drbd_suspend_io(mdev);
2617 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2618 drbd_flush_workqueue(mdev);
2619
2620 /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
2621 * in the bitmap. Otherwise, try to start a resync handshake
2622 * as sync source for full sync.
2623 */
2624 if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) {
2625 /* The peer will get a resync upon connect anyways. Just make that
2626 into a full resync.
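 * This is done by marking the peer's disk D_INCONSISTENT and setting all
 * bits in the bitmap; drbd_bmio_set_susp_al() additionally suspends the
 * activity log (editor's note).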
*/ 2627 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); 2628 if (retcode >= SS_SUCCESS) { 2629 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, 2630 "set_n_write from invalidate_peer", 2631 BM_LOCKED_SET_ALLOWED)) 2632 retcode = ERR_IO_MD_DISK; 2633 } 2634 } else 2635 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); 2636 drbd_resume_io(mdev); 2637 2638 out: 2639 drbd_adm_finish(info, retcode); 2640 return 0; 2641 } 2642 2643 int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) 2644 { 2645 enum drbd_ret_code retcode; 2646 2647 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2648 if (!adm_ctx.reply_skb) 2649 return retcode; 2650 if (retcode != NO_ERROR) 2651 goto out; 2652 2653 if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) 2654 retcode = ERR_PAUSE_IS_SET; 2655 out: 2656 drbd_adm_finish(info, retcode); 2657 return 0; 2658 } 2659 2660 int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) 2661 { 2662 union drbd_dev_state s; 2663 enum drbd_ret_code retcode; 2664 2665 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2666 if (!adm_ctx.reply_skb) 2667 return retcode; 2668 if (retcode != NO_ERROR) 2669 goto out; 2670 2671 if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { 2672 s = adm_ctx.mdev->state; 2673 if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { 2674 retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : 2675 s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; 2676 } else { 2677 retcode = ERR_PAUSE_IS_CLEAR; 2678 } 2679 } 2680 2681 out: 2682 drbd_adm_finish(info, retcode); 2683 return 0; 2684 } 2685 2686 int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) 2687 { 2688 return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); 2689 } 2690 2691 int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) 2692 { 2693 struct drbd_conf *mdev; 2694 int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ 2695 2696 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2697 if (!adm_ctx.reply_skb) 2698 return retcode; 2699 if (retcode != NO_ERROR) 2700 goto out; 2701 2702 mdev = adm_ctx.mdev; 2703 if (test_bit(NEW_CUR_UUID, &mdev->flags)) { 2704 drbd_uuid_new_current(mdev); 2705 clear_bit(NEW_CUR_UUID, &mdev->flags); 2706 } 2707 drbd_suspend_io(mdev); 2708 retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); 2709 if (retcode == SS_SUCCESS) { 2710 if (mdev->state.conn < C_CONNECTED) 2711 tl_clear(mdev->tconn); 2712 if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) 2713 tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO); 2714 } 2715 drbd_resume_io(mdev); 2716 2717 out: 2718 drbd_adm_finish(info, retcode); 2719 return 0; 2720 } 2721 2722 int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) 2723 { 2724 return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); 2725 } 2726 2727 int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr) 2728 { 2729 struct nlattr *nla; 2730 nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); 2731 if (!nla) 2732 goto nla_put_failure; 2733 if (vnr != VOLUME_UNSPECIFIED && 2734 nla_put_u32(skb, T_ctx_volume, vnr)) 2735 goto nla_put_failure; 2736 if (nla_put_string(skb, T_ctx_resource_name, tconn->name)) 2737 goto nla_put_failure; 2738 if (tconn->my_addr_len && 2739 nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr)) 2740 goto nla_put_failure; 2741 if (tconn->peer_addr_len && 2742 nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr)) 2743 goto nla_put_failure; 2744 nla_nest_end(skb, nla); 2745 return 0; 2746 2747 nla_put_failure: 2748 if (nla) 2749 nla_nest_cancel(skb, nla); 2750 return -EMSGSIZE; 2751 } 2752 2753 int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, 2754 const struct sib_info *sib) 2755 { 2756 struct state_info *si = NULL; /* for sizeof(si->member); */ 2757 struct nlattr *nla; 2758 int got_ldev; 2759 int err = 0; 2760 int exclude_sensitive; 2761 2762 /* If sib != NULL, this is drbd_bcast_event, which anyone can listen 2763 * to. So we better exclude_sensitive information. 2764 * 2765 * If sib == NULL, this is drbd_adm_get_status, executed synchronously 2766 * in the context of the requesting user process. Exclude sensitive 2767 * information, unless current has superuser. 2768 * 2769 * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and 2770 * relies on the current implementation of netlink_dump(), which 2771 * executes the dump callback successively from netlink_recvmsg(), 2772 * always in the context of the receiving process */ 2773 exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); 2774 2775 got_ldev = get_ldev(mdev); 2776 2777 /* We need to add connection name and volume number information still. 2778 * Minor number is in drbd_genlmsghdr. 
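 *
 * Rough layout of the resulting message (editor's sketch; the calls below
 * are authoritative):
 *   DRBD_NLA_CFG_CONTEXT   (nla_put_drbd_cfg_context: name, volume, addresses)
 *   resource options       (res_opts_to_skb)
 *   disk options           (disk_conf_to_skb, only while we hold the local disk)
 *   net options            (net_conf_to_skb, only if a connection is configured)
 *   DRBD_NLA_STATE_INFO    (state, counters, uuids, optional sib details)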
*/ 2779 if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr)) 2780 goto nla_put_failure; 2781 2782 if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive)) 2783 goto nla_put_failure; 2784 2785 rcu_read_lock(); 2786 if (got_ldev) { 2787 struct disk_conf *disk_conf; 2788 2789 disk_conf = rcu_dereference(mdev->ldev->disk_conf); 2790 err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive); 2791 } 2792 if (!err) { 2793 struct net_conf *nc; 2794 2795 nc = rcu_dereference(mdev->tconn->net_conf); 2796 if (nc) 2797 err = net_conf_to_skb(skb, nc, exclude_sensitive); 2798 } 2799 rcu_read_unlock(); 2800 if (err) 2801 goto nla_put_failure; 2802 2803 nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); 2804 if (!nla) 2805 goto nla_put_failure; 2806 if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || 2807 nla_put_u32(skb, T_current_state, mdev->state.i) || 2808 nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) || 2809 nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) || 2810 nla_put_u64(skb, T_send_cnt, mdev->send_cnt) || 2811 nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) || 2812 nla_put_u64(skb, T_read_cnt, mdev->read_cnt) || 2813 nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) || 2814 nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) || 2815 nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) || 2816 nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) || 2817 nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) || 2818 nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt))) 2819 goto nla_put_failure; 2820 2821 if (got_ldev) { 2822 int err; 2823 2824 spin_lock_irq(&mdev->ldev->md.uuid_lock); 2825 err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid); 2826 spin_unlock_irq(&mdev->ldev->md.uuid_lock); 2827 2828 if (err) 2829 goto nla_put_failure; 2830 2831 if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) || 2832 nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) || 2833 nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev))) 2834 goto nla_put_failure; 2835 if (C_SYNC_SOURCE <= mdev->state.conn && 2836 C_PAUSED_SYNC_T >= mdev->state.conn) { 2837 if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) || 2838 nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed)) 2839 goto nla_put_failure; 2840 } 2841 } 2842 2843 if (sib) { 2844 switch(sib->sib_reason) { 2845 case SIB_SYNC_PROGRESS: 2846 case SIB_GET_STATUS_REPLY: 2847 break; 2848 case SIB_STATE_CHANGE: 2849 if (nla_put_u32(skb, T_prev_state, sib->os.i) || 2850 nla_put_u32(skb, T_new_state, sib->ns.i)) 2851 goto nla_put_failure; 2852 break; 2853 case SIB_HELPER_POST: 2854 if (nla_put_u32(skb, T_helper_exit_code, 2855 sib->helper_exit_code)) 2856 goto nla_put_failure; 2857 /* fall through */ 2858 case SIB_HELPER_PRE: 2859 if (nla_put_string(skb, T_helper, sib->helper_name)) 2860 goto nla_put_failure; 2861 break; 2862 } 2863 } 2864 nla_nest_end(skb, nla); 2865 2866 if (0) 2867 nla_put_failure: 2868 err = -EMSGSIZE; 2869 if (got_ldev) 2870 put_ldev(mdev); 2871 return err; 2872 } 2873 2874 int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) 2875 { 2876 enum drbd_ret_code retcode; 2877 int err; 2878 2879 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2880 if (!adm_ctx.reply_skb) 2881 return retcode; 2882 if (retcode != NO_ERROR) 2883 goto out; 2884 2885 err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL); 2886 if (err) { 2887 nlmsg_free(adm_ctx.reply_skb); 2888 return err; 2889 } 2890 out: 2891 
drbd_adm_finish(info, retcode);
2892 return 0;
2893 }
2894
2895 int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
2896 {
2897 struct drbd_conf *mdev;
2898 struct drbd_genlmsghdr *dh;
2899 struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0];
2900 struct drbd_tconn *tconn = NULL;
2901 struct drbd_tconn *tmp;
2902 unsigned volume = cb->args[1];
2903
2904 /* Open coded, deferred, iteration:
2905 * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
2906 * idr_for_each_entry(&tconn->volumes, mdev, i) {
2907 * ...
2908 * }
2909 * }
2910 * where tconn is cb->args[0];
2911 * and i is cb->args[1];
2912 *
2913 * cb->args[2] indicates if we shall loop over all resources,
2914 * or just dump all volumes of a single resource.
2915 *
2916 * This may miss entries inserted after this dump started,
2917 * or entries deleted before they are reached.
2918 *
2919 * We need to make sure the mdev won't disappear while
2920 * we are looking at it, and revalidate our iterators
2921 * on each iteration.
2922 */
2923
2924 /* synchronize with conn_create()/conn_destroy() */
2925 rcu_read_lock();
2926 /* revalidate iterator position */
2927 list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) {
2928 if (pos == NULL) {
2929 /* first iteration */
2930 pos = tmp;
2931 tconn = pos;
2932 break;
2933 }
2934 if (tmp == pos) {
2935 tconn = pos;
2936 break;
2937 }
2938 }
2939 if (tconn) {
2940 next_tconn:
2941 mdev = idr_get_next(&tconn->volumes, &volume);
2942 if (!mdev) {
2943 /* No more volumes to dump on this tconn.
2944 * Advance tconn iterator. */
2945 pos = list_entry_rcu(tconn->all_tconn.next,
2946 struct drbd_tconn, all_tconn);
2947 /* Did we dump any volume on this tconn yet? */
2948 if (volume != 0) {
2949 /* If we reached the end of the list,
2950 * or only a single resource dump was requested,
2951 * we are done. */
2952 if (&pos->all_tconn == &drbd_tconns || cb->args[2])
2953 goto out;
2954 volume = 0;
2955 tconn = pos;
2956 goto next_tconn;
2957 }
2958 }
2959
2960 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
2961 cb->nlh->nlmsg_seq, &drbd_genl_family,
2962 NLM_F_MULTI, DRBD_ADM_GET_STATUS);
2963 if (!dh)
2964 goto out;
2965
2966 if (!mdev) {
2967 /* This is a tconn without a single volume.
2968 * Surprisingly enough, it may have a network
2969 * configuration. */
2970 struct net_conf *nc;
2971 dh->minor = -1U;
2972 dh->ret_code = NO_ERROR;
2973 if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED))
2974 goto cancel;
2975 nc = rcu_dereference(tconn->net_conf);
2976 if (nc && net_conf_to_skb(skb, nc, 1) != 0)
2977 goto cancel;
2978 goto done;
2979 }
2980
2981 D_ASSERT(mdev->vnr == volume);
2982 D_ASSERT(mdev->tconn == tconn);
2983
2984 dh->minor = mdev_to_minor(mdev);
2985 dh->ret_code = NO_ERROR;
2986
2987 if (nla_put_status_info(skb, mdev, NULL)) {
2988 cancel:
2989 genlmsg_cancel(skb, dh);
2990 goto out;
2991 }
2992 done:
2993 genlmsg_end(skb, dh);
2994 }
2995
2996 out:
2997 rcu_read_unlock();
2998 /* where to start the next iteration */
2999 cb->args[0] = (long)pos;
3000 cb->args[1] = (pos == tconn) ? volume + 1 : 0;
3001
3002 /* No more tconns/volumes/minors found results in an empty skb.
3003 * Which will terminate the dump. */
3004 return skb->len;
3005 }
3006
3007 /*
3008 * Request status of all resources, or of all volumes within a single resource.
3009 *
3010 * This is a dump, as the answer may not fit in a single reply skb otherwise.
3011 * Which means we cannot use the family->attrbuf or other such members, because 3012 * dump is NOT protected by the genl_lock(). During dump, we only have access 3013 * to the incoming skb, and need to opencode "parsing" of the nlattr payload. 3014 * 3015 * Once things are setup properly, we call into get_one_status(). 3016 */ 3017 int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) 3018 { 3019 const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; 3020 struct nlattr *nla; 3021 const char *resource_name; 3022 struct drbd_tconn *tconn; 3023 int maxtype; 3024 3025 /* Is this a followup call? */ 3026 if (cb->args[0]) { 3027 /* ... of a single resource dump, 3028 * and the resource iterator has been advanced already? */ 3029 if (cb->args[2] && cb->args[2] != cb->args[0]) 3030 return 0; /* DONE. */ 3031 goto dump; 3032 } 3033 3034 /* First call (from netlink_dump_start). We need to figure out 3035 * which resource(s) the user wants us to dump. */ 3036 nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), 3037 nlmsg_attrlen(cb->nlh, hdrlen), 3038 DRBD_NLA_CFG_CONTEXT); 3039 3040 /* No explicit context given. Dump all. */ 3041 if (!nla) 3042 goto dump; 3043 maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; 3044 nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); 3045 if (IS_ERR(nla)) 3046 return PTR_ERR(nla); 3047 /* context given, but no name present? */ 3048 if (!nla) 3049 return -EINVAL; 3050 resource_name = nla_data(nla); 3051 tconn = conn_get_by_name(resource_name); 3052 3053 if (!tconn) 3054 return -ENODEV; 3055 3056 kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */ 3057 3058 /* prime iterators, and set "filter" mode mark: 3059 * only dump this tconn. */ 3060 cb->args[0] = (long)tconn; 3061 /* cb->args[1] = 0; passed in this way. */ 3062 cb->args[2] = (long)tconn; 3063 3064 dump: 3065 return get_one_status(skb, cb); 3066 } 3067 3068 int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) 3069 { 3070 enum drbd_ret_code retcode; 3071 struct timeout_parms tp; 3072 int err; 3073 3074 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3075 if (!adm_ctx.reply_skb) 3076 return retcode; 3077 if (retcode != NO_ERROR) 3078 goto out; 3079 3080 tp.timeout_type = 3081 adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : 3082 test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? 
UT_DEGRADED : 3083 UT_DEFAULT; 3084 3085 err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); 3086 if (err) { 3087 nlmsg_free(adm_ctx.reply_skb); 3088 return err; 3089 } 3090 out: 3091 drbd_adm_finish(info, retcode); 3092 return 0; 3093 } 3094 3095 int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) 3096 { 3097 struct drbd_conf *mdev; 3098 enum drbd_ret_code retcode; 3099 struct start_ov_parms parms; 3100 3101 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3102 if (!adm_ctx.reply_skb) 3103 return retcode; 3104 if (retcode != NO_ERROR) 3105 goto out; 3106 3107 mdev = adm_ctx.mdev; 3108 3109 /* resume from last known position, if possible */ 3110 parms.ov_start_sector = mdev->ov_start_sector; 3111 parms.ov_stop_sector = ULLONG_MAX; 3112 if (info->attrs[DRBD_NLA_START_OV_PARMS]) { 3113 int err = start_ov_parms_from_attrs(&parms, info); 3114 if (err) { 3115 retcode = ERR_MANDATORY_TAG; 3116 drbd_msg_put_info(from_attrs_err_to_txt(err)); 3117 goto out; 3118 } 3119 } 3120 /* w_make_ov_request expects position to be aligned */ 3121 mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); 3122 mdev->ov_stop_sector = parms.ov_stop_sector; 3123 3124 /* If there is still bitmap IO pending, e.g. previous resync or verify 3125 * just being finished, wait for it before requesting a new resync. */ 3126 drbd_suspend_io(mdev); 3127 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); 3128 retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); 3129 drbd_resume_io(mdev); 3130 out: 3131 drbd_adm_finish(info, retcode); 3132 return 0; 3133 } 3134 3135 3136 int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) 3137 { 3138 struct drbd_conf *mdev; 3139 enum drbd_ret_code retcode; 3140 int skip_initial_sync = 0; 3141 int err; 3142 struct new_c_uuid_parms args; 3143 3144 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3145 if (!adm_ctx.reply_skb) 3146 return retcode; 3147 if (retcode != NO_ERROR) 3148 goto out_nolock; 3149 3150 mdev = adm_ctx.mdev; 3151 memset(&args, 0, sizeof(args)); 3152 if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { 3153 err = new_c_uuid_parms_from_attrs(&args, info); 3154 if (err) { 3155 retcode = ERR_MANDATORY_TAG; 3156 drbd_msg_put_info(from_attrs_err_to_txt(err)); 3157 goto out_nolock; 3158 } 3159 } 3160 3161 mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */ 3162 3163 if (!get_ldev(mdev)) { 3164 retcode = ERR_NO_DISK; 3165 goto out; 3166 } 3167 3168 /* this is "skip initial sync", assume to be clean */ 3169 if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 && 3170 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { 3171 dev_info(DEV, "Preparing to skip initial sync\n"); 3172 skip_initial_sync = 1; 3173 } else if (mdev->state.conn != C_STANDALONE) { 3174 retcode = ERR_CONNECTED; 3175 goto out_dec; 3176 } 3177 3178 drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... 
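 * i.e. the previous bitmap UUID is pushed into the history slots; together
 * with the fresh current UUID generated just below this starts a new data
 * generation (editor's note).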
*/ 3179 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ 3180 3181 if (args.clear_bm) { 3182 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, 3183 "clear_n_write from new_c_uuid", BM_LOCKED_MASK); 3184 if (err) { 3185 dev_err(DEV, "Writing bitmap failed with %d\n",err); 3186 retcode = ERR_IO_MD_DISK; 3187 } 3188 if (skip_initial_sync) { 3189 drbd_send_uuids_skip_initial_sync(mdev); 3190 _drbd_uuid_set(mdev, UI_BITMAP, 0); 3191 drbd_print_uuids(mdev, "cleared bitmap UUID"); 3192 spin_lock_irq(&mdev->tconn->req_lock); 3193 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), 3194 CS_VERBOSE, NULL); 3195 spin_unlock_irq(&mdev->tconn->req_lock); 3196 } 3197 } 3198 3199 drbd_md_sync(mdev); 3200 out_dec: 3201 put_ldev(mdev); 3202 out: 3203 mutex_unlock(mdev->state_mutex); 3204 out_nolock: 3205 drbd_adm_finish(info, retcode); 3206 return 0; 3207 } 3208 3209 static enum drbd_ret_code 3210 drbd_check_resource_name(const char *name) 3211 { 3212 if (!name || !name[0]) { 3213 drbd_msg_put_info("resource name missing"); 3214 return ERR_MANDATORY_TAG; 3215 } 3216 /* if we want to use these in sysfs/configfs/debugfs some day, 3217 * we must not allow slashes */ 3218 if (strchr(name, '/')) { 3219 drbd_msg_put_info("invalid resource name"); 3220 return ERR_INVALID_REQUEST; 3221 } 3222 return NO_ERROR; 3223 } 3224 3225 int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) 3226 { 3227 enum drbd_ret_code retcode; 3228 struct res_opts res_opts; 3229 int err; 3230 3231 retcode = drbd_adm_prepare(skb, info, 0); 3232 if (!adm_ctx.reply_skb) 3233 return retcode; 3234 if (retcode != NO_ERROR) 3235 goto out; 3236 3237 set_res_opts_defaults(&res_opts); 3238 err = res_opts_from_attrs(&res_opts, info); 3239 if (err && err != -ENOMSG) { 3240 retcode = ERR_MANDATORY_TAG; 3241 drbd_msg_put_info(from_attrs_err_to_txt(err)); 3242 goto out; 3243 } 3244 3245 retcode = drbd_check_resource_name(adm_ctx.resource_name); 3246 if (retcode != NO_ERROR) 3247 goto out; 3248 3249 if (adm_ctx.tconn) { 3250 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { 3251 retcode = ERR_INVALID_REQUEST; 3252 drbd_msg_put_info("resource exists"); 3253 } 3254 /* else: still NO_ERROR */ 3255 goto out; 3256 } 3257 3258 if (!conn_create(adm_ctx.resource_name, &res_opts)) 3259 retcode = ERR_NOMEM; 3260 out: 3261 drbd_adm_finish(info, retcode); 3262 return 0; 3263 } 3264 3265 int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info) 3266 { 3267 struct drbd_genlmsghdr *dh = info->userhdr; 3268 enum drbd_ret_code retcode; 3269 3270 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); 3271 if (!adm_ctx.reply_skb) 3272 return retcode; 3273 if (retcode != NO_ERROR) 3274 goto out; 3275 3276 if (dh->minor > MINORMASK) { 3277 drbd_msg_put_info("requested minor out of range"); 3278 retcode = ERR_INVALID_REQUEST; 3279 goto out; 3280 } 3281 if (adm_ctx.volume > DRBD_VOLUME_MAX) { 3282 drbd_msg_put_info("requested volume id out of range"); 3283 retcode = ERR_INVALID_REQUEST; 3284 goto out; 3285 } 3286 3287 /* drbd_adm_prepare made sure already 3288 * that mdev->tconn and mdev->vnr match the request. 
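 * If adm_ctx.mdev is already set, a minor with the requested number/volume
 * exists: with NLM_F_EXCL that is reported as ERR_MINOR_EXISTS, otherwise
 * the request is treated as an idempotent success.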
*/ 3289 if (adm_ctx.mdev) { 3290 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) 3291 retcode = ERR_MINOR_EXISTS; 3292 /* else: still NO_ERROR */ 3293 goto out; 3294 } 3295 3296 retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume); 3297 out: 3298 drbd_adm_finish(info, retcode); 3299 return 0; 3300 } 3301 3302 static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) 3303 { 3304 if (mdev->state.disk == D_DISKLESS && 3305 /* no need to be mdev->state.conn == C_STANDALONE && 3306 * we may want to delete a minor from a live replication group. 3307 */ 3308 mdev->state.role == R_SECONDARY) { 3309 _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS), 3310 CS_VERBOSE + CS_WAIT_COMPLETE); 3311 idr_remove(&mdev->tconn->volumes, mdev->vnr); 3312 idr_remove(&minors, mdev_to_minor(mdev)); 3313 destroy_workqueue(mdev->submit.wq); 3314 del_gendisk(mdev->vdisk); 3315 synchronize_rcu(); 3316 kref_put(&mdev->kref, &drbd_minor_destroy); 3317 return NO_ERROR; 3318 } else 3319 return ERR_MINOR_CONFIGURED; 3320 } 3321 3322 int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info) 3323 { 3324 enum drbd_ret_code retcode; 3325 3326 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3327 if (!adm_ctx.reply_skb) 3328 return retcode; 3329 if (retcode != NO_ERROR) 3330 goto out; 3331 3332 retcode = adm_delete_minor(adm_ctx.mdev); 3333 out: 3334 drbd_adm_finish(info, retcode); 3335 return 0; 3336 } 3337 3338 int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) 3339 { 3340 int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ 3341 struct drbd_conf *mdev; 3342 unsigned i; 3343 3344 retcode = drbd_adm_prepare(skb, info, 0); 3345 if (!adm_ctx.reply_skb) 3346 return retcode; 3347 if (retcode != NO_ERROR) 3348 goto out; 3349 3350 if (!adm_ctx.tconn) { 3351 retcode = ERR_RES_NOT_KNOWN; 3352 goto out; 3353 } 3354 3355 /* demote */ 3356 idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { 3357 retcode = drbd_set_role(mdev, R_SECONDARY, 0); 3358 if (retcode < SS_SUCCESS) { 3359 drbd_msg_put_info("failed to demote"); 3360 goto out; 3361 } 3362 } 3363 3364 retcode = conn_try_disconnect(adm_ctx.tconn, 0); 3365 if (retcode < SS_SUCCESS) { 3366 drbd_msg_put_info("failed to disconnect"); 3367 goto out; 3368 } 3369 3370 /* detach */ 3371 idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { 3372 retcode = adm_detach(mdev, 0); 3373 if (retcode < SS_SUCCESS || retcode > NO_ERROR) { 3374 drbd_msg_put_info("failed to detach"); 3375 goto out; 3376 } 3377 } 3378 3379 /* If we reach this, all volumes (of this tconn) are Secondary, 3380 * Disconnected, Diskless, aka Unconfigured. Make sure all threads have 3381 * actually stopped, state handling only does drbd_thread_stop_nowait(). 
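 *
 * (Editor's note on the overall order in drbd_adm_down(): demote every
 * volume to Secondary, disconnect, detach all disks, stop the worker here,
 * and only then delete the minors and finally the connection itself.)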
*/ 3382 drbd_thread_stop(&adm_ctx.tconn->worker); 3383 3384 /* Now, nothing can fail anymore */ 3385 3386 /* delete volumes */ 3387 idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { 3388 retcode = adm_delete_minor(mdev); 3389 if (retcode != NO_ERROR) { 3390 /* "can not happen" */ 3391 drbd_msg_put_info("failed to delete volume"); 3392 goto out; 3393 } 3394 } 3395 3396 /* delete connection */ 3397 if (conn_lowest_minor(adm_ctx.tconn) < 0) { 3398 list_del_rcu(&adm_ctx.tconn->all_tconn); 3399 synchronize_rcu(); 3400 kref_put(&adm_ctx.tconn->kref, &conn_destroy); 3401 3402 retcode = NO_ERROR; 3403 } else { 3404 /* "can not happen" */ 3405 retcode = ERR_RES_IN_USE; 3406 drbd_msg_put_info("failed to delete connection"); 3407 } 3408 goto out; 3409 out: 3410 drbd_adm_finish(info, retcode); 3411 return 0; 3412 } 3413 3414 int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) 3415 { 3416 enum drbd_ret_code retcode; 3417 3418 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); 3419 if (!adm_ctx.reply_skb) 3420 return retcode; 3421 if (retcode != NO_ERROR) 3422 goto out; 3423 3424 if (conn_lowest_minor(adm_ctx.tconn) < 0) { 3425 list_del_rcu(&adm_ctx.tconn->all_tconn); 3426 synchronize_rcu(); 3427 kref_put(&adm_ctx.tconn->kref, &conn_destroy); 3428 3429 retcode = NO_ERROR; 3430 } else { 3431 retcode = ERR_RES_IN_USE; 3432 } 3433 3434 if (retcode == NO_ERROR) 3435 drbd_thread_stop(&adm_ctx.tconn->worker); 3436 out: 3437 drbd_adm_finish(info, retcode); 3438 return 0; 3439 } 3440 3441 void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib) 3442 { 3443 static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ 3444 struct sk_buff *msg; 3445 struct drbd_genlmsghdr *d_out; 3446 unsigned seq; 3447 int err = -ENOMEM; 3448 3449 if (sib->sib_reason == SIB_SYNC_PROGRESS) { 3450 if (time_after(jiffies, mdev->rs_last_bcast + HZ)) 3451 mdev->rs_last_bcast = jiffies; 3452 else 3453 return; 3454 } 3455 3456 seq = atomic_inc_return(&drbd_genl_seq); 3457 msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); 3458 if (!msg) 3459 goto failed; 3460 3461 err = -EMSGSIZE; 3462 d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); 3463 if (!d_out) /* cannot happen, but anyways. */ 3464 goto nla_put_failure; 3465 d_out->minor = mdev_to_minor(mdev); 3466 d_out->ret_code = NO_ERROR; 3467 3468 if (nla_put_status_info(msg, mdev, sib)) 3469 goto nla_put_failure; 3470 genlmsg_end(msg, d_out); 3471 err = drbd_genl_multicast_events(msg, 0); 3472 /* msg has been consumed or freed in netlink_broadcast() */ 3473 if (err && err != -ESRCH) 3474 goto failed; 3475 3476 return; 3477 3478 nla_put_failure: 3479 nlmsg_free(msg); 3480 failed: 3481 dev_err(DEV, "Error %d while broadcasting event. " 3482 "Event seq:%u sib_reason:%u\n", 3483 err, seq, sib->sib_reason); 3484 } 3485