1 /*
2  * Generic SCSI-3 ALUA SCSI Device Handler
3  *
4  * Copyright (C) 2007-2010 Hannes Reinecke, SUSE Linux Products GmbH.
5  * All rights reserved.
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20  *
21  */
22 #include <linux/slab.h>
23 #include <linux/delay.h>
24 #include <linux/module.h>
25 #include <asm/unaligned.h>
26 #include <scsi/scsi.h>
27 #include <scsi/scsi_proto.h>
28 #include <scsi/scsi_dbg.h>
29 #include <scsi/scsi_eh.h>
30 #include <scsi/scsi_dh.h>
31 
/* Handler name (also used as log prefix) and module version string */
#define ALUA_DH_NAME "alua"
#define ALUA_DH_VER "2.0"

/*
 * Bits of the "supported states" byte of an RTPG target port group
 * descriptor (desc[1] in alua_rtpg()).
 */
#define TPGS_SUPPORT_NONE		0x00
#define TPGS_SUPPORT_OPTIMIZED		0x01
#define TPGS_SUPPORT_NONOPTIMIZED	0x02
#define TPGS_SUPPORT_STANDBY		0x04
#define TPGS_SUPPORT_UNAVAILABLE	0x08
#define TPGS_SUPPORT_LBA_DEPENDENT	0x10
#define TPGS_SUPPORT_OFFLINE		0x40
#define TPGS_SUPPORT_TRANSITION		0x80

/* Return data format field in byte 4 of the RTPG response */
#define RTPG_FMT_MASK			0x70
#define RTPG_FMT_EXT_HDR		0x10

/* TPGS modes, compared against scsi_device_tpgs() in alua_check_tpgs() */
#define TPGS_MODE_UNINITIALIZED		 -1
#define TPGS_MODE_NONE			0x0
#define TPGS_MODE_IMPLICIT		0x1
#define TPGS_MODE_EXPLICIT		0x2

/* Initial RTPG response buffer size in bytes */
#define ALUA_RTPG_SIZE			128
/* Command timeout and default transition timeout, in seconds */
#define ALUA_FAILOVER_TIMEOUT		60
/* Retry count handed to the SCSI midlayer for RTPG/STPG/TUR */
#define ALUA_FAILOVER_RETRIES		5
/* Delay before a freshly queued RTPG work item runs */
#define ALUA_RTPG_DELAY_MSECS		5

/* device handler flags (stored in alua_port_group.flags) */
#define ALUA_OPTIMIZE_STPG		0x01
#define ALUA_RTPG_EXT_HDR_UNSUPP	0x02
/* State machine flags (stored in alua_port_group.flags) */
#define ALUA_PG_RUN_RTPG		0x10
#define ALUA_PG_RUN_STPG		0x20
#define ALUA_PG_RUNNING			0x40
/* Module parameter: default for ALUA_OPTIMIZE_STPG on new port groups */
static uint optimize_stpg;
module_param(optimize_stpg, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(optimize_stpg, "Allow use of a non-optimized path, rather than sending a STPG, when implicit TPGS is supported (0=No,1=Yes). Default is 0.");

/* All known port groups; the list is modified under port_group_lock */
static LIST_HEAD(port_group_list);
static DEFINE_SPINLOCK(port_group_lock);
/* Workqueue on which every port group's rtpg_work is queued */
static struct workqueue_struct *kaluad_wq;
72 
/*
 * One target port group of one logical unit, shared by every
 * scsi_device (path) that reports the same device id and group id.
 */
struct alua_port_group {
	struct kref		kref;	/* released via release_port_group() */
	struct rcu_head		rcu;	/* for kfree_rcu() */
	struct list_head	node;	/* entry in port_group_list */
	struct list_head	dh_list; /* attached alua_dh_data (RCU list) */
	unsigned char		device_id_str[256]; /* from scsi_vpd_lun_id() */
	int			device_id_len;	/* 0 if no device id found */
	int			group_id;	/* target port group id */
	int			tpgs;		/* TPGS_MODE_* bits */
	int			state;		/* SCSI_ACCESS_STATE_* from RTPG */
	int			pref;		/* preferred bit from RTPG */
	unsigned		flags; /* ALUA_OPTIMIZE_STPG etc. and ALUA_PG_* */
	unsigned char		transition_tmo;	/* transition timeout, seconds */
	unsigned long		expiry;		/* jiffies deadline, 0 if unset */
	unsigned long		interval;	/* requeue delay, seconds */
	struct delayed_work	rtpg_work;	/* runs alua_rtpg_work() */
	spinlock_t		lock;		/* protects state, flags, lists */
	struct list_head	rtpg_list;	/* pending alua_queue_data */
	struct scsi_device	*rtpg_sdev;	/* device used by rtpg_work */
};
93 
/* Per scsi_device handler data, stored in sdev->handler_data */
struct alua_dh_data {
	struct list_head	node;	/* entry in pg->dh_list */
	struct alua_port_group __rcu *pg; /* current port group, may be NULL */
	int			group_id;
	spinlock_t		pg_lock; /* serializes updates of @pg */
	struct scsi_device	*sdev;	/* owning device */
	int			init_error; /* result of last alua_initialize() */
	struct mutex		init_mutex; /* held across alua_initialize() */
};
103 
/* Completion request queued on pg->rtpg_list by alua_activate() */
struct alua_queue_data {
	struct list_head	entry;		/* entry in pg->rtpg_list */
	activate_complete	callback_fn;	/* called from alua_rtpg_work() */
	void			*callback_data;	/* first argument of callback_fn */
};
109 
/* NOTE(review): not referenced anywhere in the visible part of this file */
#define ALUA_POLICY_SWITCH_CURRENT	0
#define ALUA_POLICY_SWITCH_ALL		1

/* Forward declarations for functions used before their definition */
static void alua_rtpg_work(struct work_struct *work);
static bool alua_rtpg_queue(struct alua_port_group *pg,
			    struct scsi_device *sdev,
			    struct alua_queue_data *qdata, bool force);
static void alua_check(struct scsi_device *sdev, bool force);
118 
119 static void release_port_group(struct kref *kref)
120 {
121 	struct alua_port_group *pg;
122 
123 	pg = container_of(kref, struct alua_port_group, kref);
124 	if (pg->rtpg_sdev)
125 		flush_delayed_work(&pg->rtpg_work);
126 	spin_lock(&port_group_lock);
127 	list_del(&pg->node);
128 	spin_unlock(&port_group_lock);
129 	kfree_rcu(pg, rcu);
130 }
131 
132 /*
133  * submit_rtpg - Issue a REPORT TARGET GROUP STATES command
134  * @sdev: sdev the command should be sent to
135  */
136 static int submit_rtpg(struct scsi_device *sdev, unsigned char *buff,
137 		       int bufflen, struct scsi_sense_hdr *sshdr, int flags)
138 {
139 	u8 cdb[COMMAND_SIZE(MAINTENANCE_IN)];
140 	int req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
141 		REQ_FAILFAST_DRIVER;
142 
143 	/* Prepare the command. */
144 	memset(cdb, 0x0, COMMAND_SIZE(MAINTENANCE_IN));
145 	cdb[0] = MAINTENANCE_IN;
146 	if (!(flags & ALUA_RTPG_EXT_HDR_UNSUPP))
147 		cdb[1] = MI_REPORT_TARGET_PGS | MI_EXT_HDR_PARAM_FMT;
148 	else
149 		cdb[1] = MI_REPORT_TARGET_PGS;
150 	put_unaligned_be32(bufflen, &cdb[6]);
151 
152 	return scsi_execute(sdev, cdb, DMA_FROM_DEVICE, buff, bufflen, NULL,
153 			sshdr, ALUA_FAILOVER_TIMEOUT * HZ,
154 			ALUA_FAILOVER_RETRIES, req_flags, 0, NULL);
155 }
156 
157 /*
158  * submit_stpg - Issue a SET TARGET PORT GROUP command
159  *
160  * Currently we're only setting the current target port group state
161  * to 'active/optimized' and let the array firmware figure out
162  * the states of the remaining groups.
163  */
164 static int submit_stpg(struct scsi_device *sdev, int group_id,
165 		       struct scsi_sense_hdr *sshdr)
166 {
167 	u8 cdb[COMMAND_SIZE(MAINTENANCE_OUT)];
168 	unsigned char stpg_data[8];
169 	int stpg_len = 8;
170 	int req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
171 		REQ_FAILFAST_DRIVER;
172 
173 	/* Prepare the data buffer */
174 	memset(stpg_data, 0, stpg_len);
175 	stpg_data[4] = SCSI_ACCESS_STATE_OPTIMAL;
176 	put_unaligned_be16(group_id, &stpg_data[6]);
177 
178 	/* Prepare the command. */
179 	memset(cdb, 0x0, COMMAND_SIZE(MAINTENANCE_OUT));
180 	cdb[0] = MAINTENANCE_OUT;
181 	cdb[1] = MO_SET_TARGET_PGS;
182 	put_unaligned_be32(stpg_len, &cdb[6]);
183 
184 	return scsi_execute(sdev, cdb, DMA_TO_DEVICE, stpg_data, stpg_len, NULL,
185 			sshdr, ALUA_FAILOVER_TIMEOUT * HZ,
186 			ALUA_FAILOVER_RETRIES, req_flags, 0, NULL);
187 }
188 
189 static struct alua_port_group *alua_find_get_pg(char *id_str, size_t id_size,
190 						int group_id)
191 {
192 	struct alua_port_group *pg;
193 
194 	if (!id_str || !id_size || !strlen(id_str))
195 		return NULL;
196 
197 	list_for_each_entry(pg, &port_group_list, node) {
198 		if (pg->group_id != group_id)
199 			continue;
200 		if (!pg->device_id_len || pg->device_id_len != id_size)
201 			continue;
202 		if (strncmp(pg->device_id_str, id_str, id_size))
203 			continue;
204 		if (!kref_get_unless_zero(&pg->kref))
205 			continue;
206 		return pg;
207 	}
208 
209 	return NULL;
210 }
211 
212 /*
213  * alua_alloc_pg - Allocate a new port_group structure
214  * @sdev: scsi device
215  * @h: alua device_handler data
216  * @group_id: port group id
217  *
218  * Allocate a new port_group structure for a given
219  * device.
220  */
221 static struct alua_port_group *alua_alloc_pg(struct scsi_device *sdev,
222 					     int group_id, int tpgs)
223 {
224 	struct alua_port_group *pg, *tmp_pg;
225 
226 	pg = kzalloc(sizeof(struct alua_port_group), GFP_KERNEL);
227 	if (!pg)
228 		return ERR_PTR(-ENOMEM);
229 
230 	pg->device_id_len = scsi_vpd_lun_id(sdev, pg->device_id_str,
231 					    sizeof(pg->device_id_str));
232 	if (pg->device_id_len <= 0) {
233 		/*
234 		 * TPGS supported but no device identification found.
235 		 * Generate private device identification.
236 		 */
237 		sdev_printk(KERN_INFO, sdev,
238 			    "%s: No device descriptors found\n",
239 			    ALUA_DH_NAME);
240 		pg->device_id_str[0] = '\0';
241 		pg->device_id_len = 0;
242 	}
243 	pg->group_id = group_id;
244 	pg->tpgs = tpgs;
245 	pg->state = SCSI_ACCESS_STATE_OPTIMAL;
246 	if (optimize_stpg)
247 		pg->flags |= ALUA_OPTIMIZE_STPG;
248 	kref_init(&pg->kref);
249 	INIT_DELAYED_WORK(&pg->rtpg_work, alua_rtpg_work);
250 	INIT_LIST_HEAD(&pg->rtpg_list);
251 	INIT_LIST_HEAD(&pg->node);
252 	INIT_LIST_HEAD(&pg->dh_list);
253 	spin_lock_init(&pg->lock);
254 
255 	spin_lock(&port_group_lock);
256 	tmp_pg = alua_find_get_pg(pg->device_id_str, pg->device_id_len,
257 				  group_id);
258 	if (tmp_pg) {
259 		spin_unlock(&port_group_lock);
260 		kfree(pg);
261 		return tmp_pg;
262 	}
263 
264 	list_add(&pg->node, &port_group_list);
265 	spin_unlock(&port_group_lock);
266 
267 	return pg;
268 }
269 
270 /*
271  * alua_check_tpgs - Evaluate TPGS setting
272  * @sdev: device to be checked
273  *
274  * Examine the TPGS setting of the sdev to find out if ALUA
275  * is supported.
276  */
277 static int alua_check_tpgs(struct scsi_device *sdev)
278 {
279 	int tpgs = TPGS_MODE_NONE;
280 
281 	/*
282 	 * ALUA support for non-disk devices is fraught with
283 	 * difficulties, so disable it for now.
284 	 */
285 	if (sdev->type != TYPE_DISK) {
286 		sdev_printk(KERN_INFO, sdev,
287 			    "%s: disable for non-disk devices\n",
288 			    ALUA_DH_NAME);
289 		return tpgs;
290 	}
291 
292 	tpgs = scsi_device_tpgs(sdev);
293 	switch (tpgs) {
294 	case TPGS_MODE_EXPLICIT|TPGS_MODE_IMPLICIT:
295 		sdev_printk(KERN_INFO, sdev,
296 			    "%s: supports implicit and explicit TPGS\n",
297 			    ALUA_DH_NAME);
298 		break;
299 	case TPGS_MODE_EXPLICIT:
300 		sdev_printk(KERN_INFO, sdev, "%s: supports explicit TPGS\n",
301 			    ALUA_DH_NAME);
302 		break;
303 	case TPGS_MODE_IMPLICIT:
304 		sdev_printk(KERN_INFO, sdev, "%s: supports implicit TPGS\n",
305 			    ALUA_DH_NAME);
306 		break;
307 	case TPGS_MODE_NONE:
308 		sdev_printk(KERN_INFO, sdev, "%s: not supported\n",
309 			    ALUA_DH_NAME);
310 		break;
311 	default:
312 		sdev_printk(KERN_INFO, sdev,
313 			    "%s: unsupported TPGS setting %d\n",
314 			    ALUA_DH_NAME, tpgs);
315 		tpgs = TPGS_MODE_NONE;
316 		break;
317 	}
318 
319 	return tpgs;
320 }
321 
/*
 * alua_check_vpd - Evaluate INQUIRY vpd page 0x83
 * @sdev: device to be checked
 * @h: ALUA handler data of @sdev
 * @tpgs: TPGS mode as returned by alua_check_tpgs()
 *
 * Extract the relative target port and the target port group
 * descriptor from the list of identifiers, look up or allocate the
 * matching port group, attach @h to it and queue an RTPG.
 *
 * Returns SCSI_DH_OK on success, SCSI_DH_DEV_UNSUPP if no target port
 * group id could be determined and SCSI_DH_NOMEM if the port group
 * could not be allocated.
 */
static int alua_check_vpd(struct scsi_device *sdev, struct alua_dh_data *h,
			  int tpgs)
{
	int rel_port = -1, group_id;
	struct alua_port_group *pg, *old_pg = NULL;
	bool pg_updated = false;
	unsigned long flags;

	group_id = scsi_vpd_tpg_id(sdev, &rel_port);
	if (group_id < 0) {
		/*
		 * Internal error; TPGS supported but required
		 * VPD identification descriptors not present.
		 * Disable ALUA support
		 */
		sdev_printk(KERN_INFO, sdev,
			    "%s: No target port descriptors found\n",
			    ALUA_DH_NAME);
		return SCSI_DH_DEV_UNSUPP;
	}

	/*
	 * On success pg carries one reference: the initial one of a new
	 * group or the one taken by the lookup of an existing group.
	 */
	pg = alua_alloc_pg(sdev, group_id, tpgs);
	if (IS_ERR(pg)) {
		if (PTR_ERR(pg) == -ENOMEM)
			return SCSI_DH_NOMEM;
		return SCSI_DH_DEV_UNSUPP;
	}
	if (pg->device_id_len)
		sdev_printk(KERN_INFO, sdev,
			    "%s: device %s port group %x rel port %x\n",
			    ALUA_DH_NAME, pg->device_id_str,
			    group_id, rel_port);
	else
		sdev_printk(KERN_INFO, sdev,
			    "%s: port group %x rel port %x\n",
			    ALUA_DH_NAME, group_id, rel_port);

	/* Check for existing port group references */
	spin_lock(&h->pg_lock);
	old_pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
	if (old_pg != pg) {
		/* port group has changed. Update to new port group */
		if (h->pg) {
			/* unlink h from the dh_list of the previous group */
			spin_lock_irqsave(&old_pg->lock, flags);
			list_del_rcu(&h->node);
			spin_unlock_irqrestore(&old_pg->lock, flags);
		}
		rcu_assign_pointer(h->pg, pg);
		pg_updated = true;
	}

	/* link h into the new group's dh_list only if the group changed */
	spin_lock_irqsave(&pg->lock, flags);
	if (pg_updated)
		list_add_rcu(&h->node, &pg->dh_list);
	spin_unlock_irqrestore(&pg->lock, flags);

	/* force a fresh evaluation of the port group state */
	alua_rtpg_queue(rcu_dereference_protected(h->pg,
						  lockdep_is_held(&h->pg_lock)),
			sdev, NULL, true);
	spin_unlock(&h->pg_lock);

	/*
	 * Drop the reference held through the previous h->pg. If the group
	 * did not change, this balances the extra reference obtained from
	 * alua_alloc_pg() above.
	 */
	if (old_pg)
		kref_put(&old_pg->kref, release_port_group);

	return SCSI_DH_OK;
}
395 
396 static char print_alua_state(unsigned char state)
397 {
398 	switch (state) {
399 	case SCSI_ACCESS_STATE_OPTIMAL:
400 		return 'A';
401 	case SCSI_ACCESS_STATE_ACTIVE:
402 		return 'N';
403 	case SCSI_ACCESS_STATE_STANDBY:
404 		return 'S';
405 	case SCSI_ACCESS_STATE_UNAVAILABLE:
406 		return 'U';
407 	case SCSI_ACCESS_STATE_LBA:
408 		return 'L';
409 	case SCSI_ACCESS_STATE_OFFLINE:
410 		return 'O';
411 	case SCSI_ACCESS_STATE_TRANSITIONING:
412 		return 'T';
413 	default:
414 		return 'X';
415 	}
416 }
417 
418 static int alua_check_sense(struct scsi_device *sdev,
419 			    struct scsi_sense_hdr *sense_hdr)
420 {
421 	switch (sense_hdr->sense_key) {
422 	case NOT_READY:
423 		if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) {
424 			/*
425 			 * LUN Not Accessible - ALUA state transition
426 			 */
427 			alua_check(sdev, false);
428 			return NEEDS_RETRY;
429 		}
430 		break;
431 	case UNIT_ATTENTION:
432 		if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x00) {
433 			/*
434 			 * Power On, Reset, or Bus Device Reset.
435 			 * Might have obscured a state transition,
436 			 * so schedule a recheck.
437 			 */
438 			alua_check(sdev, true);
439 			return ADD_TO_MLQUEUE;
440 		}
441 		if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x04)
442 			/*
443 			 * Device internal reset
444 			 */
445 			return ADD_TO_MLQUEUE;
446 		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x01)
447 			/*
448 			 * Mode Parameters Changed
449 			 */
450 			return ADD_TO_MLQUEUE;
451 		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x06) {
452 			/*
453 			 * ALUA state changed
454 			 */
455 			alua_check(sdev, true);
456 			return ADD_TO_MLQUEUE;
457 		}
458 		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x07) {
459 			/*
460 			 * Implicit ALUA state transition failed
461 			 */
462 			alua_check(sdev, true);
463 			return ADD_TO_MLQUEUE;
464 		}
465 		if (sense_hdr->asc == 0x3f && sense_hdr->ascq == 0x03)
466 			/*
467 			 * Inquiry data has changed
468 			 */
469 			return ADD_TO_MLQUEUE;
470 		if (sense_hdr->asc == 0x3f && sense_hdr->ascq == 0x0e)
471 			/*
472 			 * REPORTED_LUNS_DATA_HAS_CHANGED is reported
473 			 * when switching controllers on targets like
474 			 * Intel Multi-Flex. We can just retry.
475 			 */
476 			return ADD_TO_MLQUEUE;
477 		break;
478 	}
479 
480 	return SCSI_RETURN_NOT_HANDLED;
481 }
482 
483 /*
484  * alua_tur - Send a TEST UNIT READY
485  * @sdev: device to which the TEST UNIT READY command should be send
486  *
487  * Send a TEST UNIT READY to @sdev to figure out the device state
488  * Returns SCSI_DH_RETRY if the sense code is NOT READY/ALUA TRANSITIONING,
489  * SCSI_DH_OK if no error occurred, and SCSI_DH_IO otherwise.
490  */
491 static int alua_tur(struct scsi_device *sdev)
492 {
493 	struct scsi_sense_hdr sense_hdr;
494 	int retval;
495 
496 	retval = scsi_test_unit_ready(sdev, ALUA_FAILOVER_TIMEOUT * HZ,
497 				      ALUA_FAILOVER_RETRIES, &sense_hdr);
498 	if (sense_hdr.sense_key == NOT_READY &&
499 	    sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x0a)
500 		return SCSI_DH_RETRY;
501 	else if (retval)
502 		return SCSI_DH_IO;
503 	else
504 		return SCSI_DH_OK;
505 }
506 
507 /*
508  * alua_rtpg - Evaluate REPORT TARGET GROUP STATES
509  * @sdev: the device to be evaluated.
510  *
511  * Evaluate the Target Port Group State.
512  * Returns SCSI_DH_DEV_OFFLINED if the path is
513  * found to be unusable.
514  */
515 static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
516 {
517 	struct scsi_sense_hdr sense_hdr;
518 	struct alua_port_group *tmp_pg;
519 	int len, k, off, valid_states = 0, bufflen = ALUA_RTPG_SIZE;
520 	unsigned char *desc, *buff;
521 	unsigned err, retval;
522 	unsigned int tpg_desc_tbl_off;
523 	unsigned char orig_transition_tmo;
524 	unsigned long flags;
525 
526 	if (!pg->expiry) {
527 		unsigned long transition_tmo = ALUA_FAILOVER_TIMEOUT * HZ;
528 
529 		if (pg->transition_tmo)
530 			transition_tmo = pg->transition_tmo * HZ;
531 
532 		pg->expiry = round_jiffies_up(jiffies + transition_tmo);
533 	}
534 
535 	buff = kzalloc(bufflen, GFP_KERNEL);
536 	if (!buff)
537 		return SCSI_DH_DEV_TEMP_BUSY;
538 
539  retry:
540 	err = 0;
541 	retval = submit_rtpg(sdev, buff, bufflen, &sense_hdr, pg->flags);
542 
543 	if (retval) {
544 		if (!scsi_sense_valid(&sense_hdr)) {
545 			sdev_printk(KERN_INFO, sdev,
546 				    "%s: rtpg failed, result %d\n",
547 				    ALUA_DH_NAME, retval);
548 			kfree(buff);
549 			if (driver_byte(retval) == DRIVER_ERROR)
550 				return SCSI_DH_DEV_TEMP_BUSY;
551 			return SCSI_DH_IO;
552 		}
553 
554 		/*
555 		 * submit_rtpg() has failed on existing arrays
556 		 * when requesting extended header info, and
557 		 * the array doesn't support extended headers,
558 		 * even though it shouldn't according to T10.
559 		 * The retry without rtpg_ext_hdr_req set
560 		 * handles this.
561 		 */
562 		if (!(pg->flags & ALUA_RTPG_EXT_HDR_UNSUPP) &&
563 		    sense_hdr.sense_key == ILLEGAL_REQUEST &&
564 		    sense_hdr.asc == 0x24 && sense_hdr.ascq == 0) {
565 			pg->flags |= ALUA_RTPG_EXT_HDR_UNSUPP;
566 			goto retry;
567 		}
568 		/*
569 		 * Retry on ALUA state transition or if any
570 		 * UNIT ATTENTION occurred.
571 		 */
572 		if (sense_hdr.sense_key == NOT_READY &&
573 		    sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x0a)
574 			err = SCSI_DH_RETRY;
575 		else if (sense_hdr.sense_key == UNIT_ATTENTION)
576 			err = SCSI_DH_RETRY;
577 		if (err == SCSI_DH_RETRY &&
578 		    pg->expiry != 0 && time_before(jiffies, pg->expiry)) {
579 			sdev_printk(KERN_ERR, sdev, "%s: rtpg retry\n",
580 				    ALUA_DH_NAME);
581 			scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
582 			kfree(buff);
583 			return err;
584 		}
585 		sdev_printk(KERN_ERR, sdev, "%s: rtpg failed\n",
586 			    ALUA_DH_NAME);
587 		scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
588 		kfree(buff);
589 		pg->expiry = 0;
590 		return SCSI_DH_IO;
591 	}
592 
593 	len = get_unaligned_be32(&buff[0]) + 4;
594 
595 	if (len > bufflen) {
596 		/* Resubmit with the correct length */
597 		kfree(buff);
598 		bufflen = len;
599 		buff = kmalloc(bufflen, GFP_KERNEL);
600 		if (!buff) {
601 			sdev_printk(KERN_WARNING, sdev,
602 				    "%s: kmalloc buffer failed\n",__func__);
603 			/* Temporary failure, bypass */
604 			pg->expiry = 0;
605 			return SCSI_DH_DEV_TEMP_BUSY;
606 		}
607 		goto retry;
608 	}
609 
610 	orig_transition_tmo = pg->transition_tmo;
611 	if ((buff[4] & RTPG_FMT_MASK) == RTPG_FMT_EXT_HDR && buff[5] != 0)
612 		pg->transition_tmo = buff[5];
613 	else
614 		pg->transition_tmo = ALUA_FAILOVER_TIMEOUT;
615 
616 	if (orig_transition_tmo != pg->transition_tmo) {
617 		sdev_printk(KERN_INFO, sdev,
618 			    "%s: transition timeout set to %d seconds\n",
619 			    ALUA_DH_NAME, pg->transition_tmo);
620 		pg->expiry = jiffies + pg->transition_tmo * HZ;
621 	}
622 
623 	if ((buff[4] & RTPG_FMT_MASK) == RTPG_FMT_EXT_HDR)
624 		tpg_desc_tbl_off = 8;
625 	else
626 		tpg_desc_tbl_off = 4;
627 
628 	for (k = tpg_desc_tbl_off, desc = buff + tpg_desc_tbl_off;
629 	     k < len;
630 	     k += off, desc += off) {
631 		u16 group_id = get_unaligned_be16(&desc[2]);
632 
633 		spin_lock_irqsave(&port_group_lock, flags);
634 		tmp_pg = alua_find_get_pg(pg->device_id_str, pg->device_id_len,
635 					  group_id);
636 		spin_unlock_irqrestore(&port_group_lock, flags);
637 		if (tmp_pg) {
638 			if (spin_trylock_irqsave(&tmp_pg->lock, flags)) {
639 				if ((tmp_pg == pg) ||
640 				    !(tmp_pg->flags & ALUA_PG_RUNNING)) {
641 					struct alua_dh_data *h;
642 
643 					tmp_pg->state = desc[0] & 0x0f;
644 					tmp_pg->pref = desc[0] >> 7;
645 					rcu_read_lock();
646 					list_for_each_entry_rcu(h,
647 						&tmp_pg->dh_list, node) {
648 						/* h->sdev should always be valid */
649 						BUG_ON(!h->sdev);
650 						h->sdev->access_state = desc[0];
651 					}
652 					rcu_read_unlock();
653 				}
654 				if (tmp_pg == pg)
655 					valid_states = desc[1];
656 				spin_unlock_irqrestore(&tmp_pg->lock, flags);
657 			}
658 			kref_put(&tmp_pg->kref, release_port_group);
659 		}
660 		off = 8 + (desc[7] * 4);
661 	}
662 
663 	spin_lock_irqsave(&pg->lock, flags);
664 	sdev_printk(KERN_INFO, sdev,
665 		    "%s: port group %02x state %c %s supports %c%c%c%c%c%c%c\n",
666 		    ALUA_DH_NAME, pg->group_id, print_alua_state(pg->state),
667 		    pg->pref ? "preferred" : "non-preferred",
668 		    valid_states&TPGS_SUPPORT_TRANSITION?'T':'t',
669 		    valid_states&TPGS_SUPPORT_OFFLINE?'O':'o',
670 		    valid_states&TPGS_SUPPORT_LBA_DEPENDENT?'L':'l',
671 		    valid_states&TPGS_SUPPORT_UNAVAILABLE?'U':'u',
672 		    valid_states&TPGS_SUPPORT_STANDBY?'S':'s',
673 		    valid_states&TPGS_SUPPORT_NONOPTIMIZED?'N':'n',
674 		    valid_states&TPGS_SUPPORT_OPTIMIZED?'A':'a');
675 
676 	switch (pg->state) {
677 	case SCSI_ACCESS_STATE_TRANSITIONING:
678 		if (time_before(jiffies, pg->expiry)) {
679 			/* State transition, retry */
680 			pg->interval = 2;
681 			err = SCSI_DH_RETRY;
682 		} else {
683 			struct alua_dh_data *h;
684 
685 			/* Transitioning time exceeded, set port to standby */
686 			err = SCSI_DH_IO;
687 			pg->state = SCSI_ACCESS_STATE_STANDBY;
688 			pg->expiry = 0;
689 			rcu_read_lock();
690 			list_for_each_entry_rcu(h, &pg->dh_list, node) {
691 				BUG_ON(!h->sdev);
692 				h->sdev->access_state =
693 					(pg->state & SCSI_ACCESS_STATE_MASK);
694 				if (pg->pref)
695 					h->sdev->access_state |=
696 						SCSI_ACCESS_STATE_PREFERRED;
697 			}
698 			rcu_read_unlock();
699 		}
700 		break;
701 	case SCSI_ACCESS_STATE_OFFLINE:
702 		/* Path unusable */
703 		err = SCSI_DH_DEV_OFFLINED;
704 		pg->expiry = 0;
705 		break;
706 	default:
707 		/* Useable path if active */
708 		err = SCSI_DH_OK;
709 		pg->expiry = 0;
710 		break;
711 	}
712 	spin_unlock_irqrestore(&pg->lock, flags);
713 	kfree(buff);
714 	return err;
715 }
716 
717 /*
718  * alua_stpg - Issue a SET TARGET PORT GROUP command
719  *
720  * Issue a SET TARGET PORT GROUP command and evaluate the
721  * response. Returns SCSI_DH_RETRY per default to trigger
722  * a re-evaluation of the target group state or SCSI_DH_OK
723  * if no further action needs to be taken.
724  */
725 static unsigned alua_stpg(struct scsi_device *sdev, struct alua_port_group *pg)
726 {
727 	int retval;
728 	struct scsi_sense_hdr sense_hdr;
729 
730 	if (!(pg->tpgs & TPGS_MODE_EXPLICIT)) {
731 		/* Only implicit ALUA supported, retry */
732 		return SCSI_DH_RETRY;
733 	}
734 	switch (pg->state) {
735 	case SCSI_ACCESS_STATE_OPTIMAL:
736 		return SCSI_DH_OK;
737 	case SCSI_ACCESS_STATE_ACTIVE:
738 		if ((pg->flags & ALUA_OPTIMIZE_STPG) &&
739 		    !pg->pref &&
740 		    (pg->tpgs & TPGS_MODE_IMPLICIT))
741 			return SCSI_DH_OK;
742 		break;
743 	case SCSI_ACCESS_STATE_STANDBY:
744 	case SCSI_ACCESS_STATE_UNAVAILABLE:
745 		break;
746 	case SCSI_ACCESS_STATE_OFFLINE:
747 		return SCSI_DH_IO;
748 	case SCSI_ACCESS_STATE_TRANSITIONING:
749 		break;
750 	default:
751 		sdev_printk(KERN_INFO, sdev,
752 			    "%s: stpg failed, unhandled TPGS state %d",
753 			    ALUA_DH_NAME, pg->state);
754 		return SCSI_DH_NOSYS;
755 	}
756 	retval = submit_stpg(sdev, pg->group_id, &sense_hdr);
757 
758 	if (retval) {
759 		if (!scsi_sense_valid(&sense_hdr)) {
760 			sdev_printk(KERN_INFO, sdev,
761 				    "%s: stpg failed, result %d",
762 				    ALUA_DH_NAME, retval);
763 			if (driver_byte(retval) == DRIVER_ERROR)
764 				return SCSI_DH_DEV_TEMP_BUSY;
765 		} else {
766 			sdev_printk(KERN_INFO, sdev, "%s: stpg failed\n",
767 				    ALUA_DH_NAME);
768 			scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
769 		}
770 	}
771 	/* Retry RTPG */
772 	return SCSI_DH_RETRY;
773 }
774 
/*
 * alua_rtpg_work - delayed work handler running RTPG/STPG for a port group
 * @work: the work_struct embedded in pg->rtpg_work
 *
 * Executes the actions requested through pg->flags on pg->rtpg_sdev:
 * ALUA_PG_RUN_RTPG triggers alua_rtpg() (preceded by a TEST UNIT READY
 * if the cached state is 'transitioning'), ALUA_PG_RUN_STPG triggers
 * alua_stpg(). If a step asks for a retry, or another RTPG was
 * requested in the meantime, the work requeues itself and returns.
 * Otherwise all callbacks queued on pg->rtpg_list are invoked with the
 * final status, and the device and port group references are dropped.
 */
static void alua_rtpg_work(struct work_struct *work)
{
	struct alua_port_group *pg =
		container_of(work, struct alua_port_group, rtpg_work.work);
	struct scsi_device *sdev;
	LIST_HEAD(qdata_list);
	int err = SCSI_DH_OK;
	struct alua_queue_data *qdata, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&pg->lock, flags);
	sdev = pg->rtpg_sdev;
	if (!sdev) {
		/* Nothing to do; no action flags are expected in this case */
		WARN_ON(pg->flags & ALUA_PG_RUN_RTPG);
		WARN_ON(pg->flags & ALUA_PG_RUN_STPG);
		spin_unlock_irqrestore(&pg->lock, flags);
		kref_put(&pg->kref, release_port_group);
		return;
	}
	pg->flags |= ALUA_PG_RUNNING;
	if (pg->flags & ALUA_PG_RUN_RTPG) {
		int state = pg->state;

		pg->flags &= ~ALUA_PG_RUN_RTPG;
		/* the commands below are issued without pg->lock held */
		spin_unlock_irqrestore(&pg->lock, flags);
		if (state == SCSI_ACCESS_STATE_TRANSITIONING) {
			if (alua_tur(sdev) == SCSI_DH_RETRY) {
				/* still transitioning: poll again later */
				spin_lock_irqsave(&pg->lock, flags);
				pg->flags &= ~ALUA_PG_RUNNING;
				pg->flags |= ALUA_PG_RUN_RTPG;
				spin_unlock_irqrestore(&pg->lock, flags);
				queue_delayed_work(kaluad_wq, &pg->rtpg_work,
						   pg->interval * HZ);
				return;
			}
			/* Send RTPG on failure or if TUR indicates SUCCESS */
		}
		err = alua_rtpg(sdev, pg);
		spin_lock_irqsave(&pg->lock, flags);
		/* requeue on retry or if a new RTPG was requested meanwhile */
		if (err == SCSI_DH_RETRY || pg->flags & ALUA_PG_RUN_RTPG) {
			pg->flags &= ~ALUA_PG_RUNNING;
			pg->flags |= ALUA_PG_RUN_RTPG;
			spin_unlock_irqrestore(&pg->lock, flags);
			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
					   pg->interval * HZ);
			return;
		}
		/* do not attempt an STPG after a failed RTPG */
		if (err != SCSI_DH_OK)
			pg->flags &= ~ALUA_PG_RUN_STPG;
	}
	if (pg->flags & ALUA_PG_RUN_STPG) {
		pg->flags &= ~ALUA_PG_RUN_STPG;
		spin_unlock_irqrestore(&pg->lock, flags);
		err = alua_stpg(sdev, pg);
		spin_lock_irqsave(&pg->lock, flags);
		if (err == SCSI_DH_RETRY || pg->flags & ALUA_PG_RUN_RTPG) {
			/* re-read the state right away (interval is 0) */
			pg->flags |= ALUA_PG_RUN_RTPG;
			pg->interval = 0;
			pg->flags &= ~ALUA_PG_RUNNING;
			spin_unlock_irqrestore(&pg->lock, flags);
			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
					   pg->interval * HZ);
			return;
		}
	}

	/* Done: take over the pending callbacks and release the device slot */
	list_splice_init(&pg->rtpg_list, &qdata_list);
	pg->rtpg_sdev = NULL;
	spin_unlock_irqrestore(&pg->lock, flags);

	/* callbacks run without pg->lock held */
	list_for_each_entry_safe(qdata, tmp, &qdata_list, entry) {
		list_del(&qdata->entry);
		if (qdata->callback_fn)
			qdata->callback_fn(qdata->callback_data, err);
		kfree(qdata);
	}
	spin_lock_irqsave(&pg->lock, flags);
	pg->flags &= ~ALUA_PG_RUNNING;
	spin_unlock_irqrestore(&pg->lock, flags);
	/* drop the references taken in alua_rtpg_queue() */
	scsi_device_put(sdev);
	kref_put(&pg->kref, release_port_group);
}
857 
/**
 * alua_rtpg_queue() - cause RTPG to be submitted asynchronously
 * @pg: port group to be evaluated
 * @sdev: device to use for the commands if the port group has none
 *	  recorded yet
 * @qdata: optional completion request; if set it is appended to the
 *	   port group's callback list and an STPG is requested as well
 * @force: request an RTPG even if a device is already recorded for
 *	   the port group
 *
 * Returns false, without queuing @qdata, if @pg is NULL or if no
 * reference to @sdev could be obtained; returns true otherwise.
 * alua_rtpg_work() is responsible for calling @qdata->callback_fn().
 */
static bool alua_rtpg_queue(struct alua_port_group *pg,
			    struct scsi_device *sdev,
			    struct alua_queue_data *qdata, bool force)
{
	int start_queue = 0;
	unsigned long flags;
	if (WARN_ON_ONCE(!pg) || scsi_device_get(sdev))
		return false;

	spin_lock_irqsave(&pg->lock, flags);
	if (qdata) {
		list_add_tail(&qdata->entry, &pg->rtpg_list);
		pg->flags |= ALUA_PG_RUN_STPG;
		force = true;
	}
	if (pg->rtpg_sdev == NULL) {
		/* no work outstanding: record the device and start the work */
		pg->interval = 0;
		pg->flags |= ALUA_PG_RUN_RTPG;
		kref_get(&pg->kref);
		pg->rtpg_sdev = sdev;
		start_queue = 1;
	} else if (!(pg->flags & ALUA_PG_RUN_RTPG) && force) {
		pg->flags |= ALUA_PG_RUN_RTPG;
		/* Do not queue if the worker is already running */
		if (!(pg->flags & ALUA_PG_RUNNING)) {
			kref_get(&pg->kref);
			start_queue = 1;
		}
	}

	spin_unlock_irqrestore(&pg->lock, flags);

	if (start_queue) {
		/*
		 * If the work was queued, the device reference is kept
		 * (sdev is cleared so it is not put below); otherwise the
		 * port group reference taken above is dropped again.
		 */
		if (queue_delayed_work(kaluad_wq, &pg->rtpg_work,
				msecs_to_jiffies(ALUA_RTPG_DELAY_MSECS)))
			sdev = NULL;
		else
			kref_put(&pg->kref, release_port_group);
	}
	if (sdev)
		scsi_device_put(sdev);

	return true;
}
908 
909 /*
910  * alua_initialize - Initialize ALUA state
911  * @sdev: the device to be initialized
912  *
913  * For the prep_fn to work correctly we have
914  * to initialize the ALUA state for the device.
915  */
916 static int alua_initialize(struct scsi_device *sdev, struct alua_dh_data *h)
917 {
918 	int err = SCSI_DH_DEV_UNSUPP, tpgs;
919 
920 	mutex_lock(&h->init_mutex);
921 	tpgs = alua_check_tpgs(sdev);
922 	if (tpgs != TPGS_MODE_NONE)
923 		err = alua_check_vpd(sdev, h, tpgs);
924 	h->init_error = err;
925 	mutex_unlock(&h->init_mutex);
926 	return err;
927 }
928 /*
929  * alua_set_params - set/unset the optimize flag
930  * @sdev: device on the path to be activated
931  * params - parameters in the following format
932  *      "no_of_params\0param1\0param2\0param3\0...\0"
933  * For example, to set the flag pass the following parameters
934  * from multipath.conf
935  *     hardware_handler        "2 alua 1"
936  */
937 static int alua_set_params(struct scsi_device *sdev, const char *params)
938 {
939 	struct alua_dh_data *h = sdev->handler_data;
940 	struct alua_port_group *pg = NULL;
941 	unsigned int optimize = 0, argc;
942 	const char *p = params;
943 	int result = SCSI_DH_OK;
944 	unsigned long flags;
945 
946 	if ((sscanf(params, "%u", &argc) != 1) || (argc != 1))
947 		return -EINVAL;
948 
949 	while (*p++)
950 		;
951 	if ((sscanf(p, "%u", &optimize) != 1) || (optimize > 1))
952 		return -EINVAL;
953 
954 	rcu_read_lock();
955 	pg = rcu_dereference(h->pg);
956 	if (!pg) {
957 		rcu_read_unlock();
958 		return -ENXIO;
959 	}
960 	spin_lock_irqsave(&pg->lock, flags);
961 	if (optimize)
962 		pg->flags |= ALUA_OPTIMIZE_STPG;
963 	else
964 		pg->flags &= ~ALUA_OPTIMIZE_STPG;
965 	spin_unlock_irqrestore(&pg->lock, flags);
966 	rcu_read_unlock();
967 
968 	return result;
969 }
970 
971 /*
972  * alua_activate - activate a path
973  * @sdev: device on the path to be activated
974  *
975  * We're currently switching the port group to be activated only and
976  * let the array figure out the rest.
977  * There may be other arrays which require us to switch all port groups
978  * based on a certain policy. But until we actually encounter them it
979  * should be okay.
980  */
981 static int alua_activate(struct scsi_device *sdev,
982 			activate_complete fn, void *data)
983 {
984 	struct alua_dh_data *h = sdev->handler_data;
985 	int err = SCSI_DH_OK;
986 	struct alua_queue_data *qdata;
987 	struct alua_port_group *pg;
988 
989 	qdata = kzalloc(sizeof(*qdata), GFP_KERNEL);
990 	if (!qdata) {
991 		err = SCSI_DH_RES_TEMP_UNAVAIL;
992 		goto out;
993 	}
994 	qdata->callback_fn = fn;
995 	qdata->callback_data = data;
996 
997 	mutex_lock(&h->init_mutex);
998 	rcu_read_lock();
999 	pg = rcu_dereference(h->pg);
1000 	if (!pg || !kref_get_unless_zero(&pg->kref)) {
1001 		rcu_read_unlock();
1002 		kfree(qdata);
1003 		err = h->init_error;
1004 		mutex_unlock(&h->init_mutex);
1005 		goto out;
1006 	}
1007 	rcu_read_unlock();
1008 	mutex_unlock(&h->init_mutex);
1009 
1010 	if (alua_rtpg_queue(pg, sdev, qdata, true))
1011 		fn = NULL;
1012 	else
1013 		err = SCSI_DH_DEV_OFFLINED;
1014 	kref_put(&pg->kref, release_port_group);
1015 out:
1016 	if (fn)
1017 		fn(data, err);
1018 	return 0;
1019 }
1020 
1021 /*
1022  * alua_check - check path status
1023  * @sdev: device on the path to be checked
1024  *
1025  * Check the device status
1026  */
1027 static void alua_check(struct scsi_device *sdev, bool force)
1028 {
1029 	struct alua_dh_data *h = sdev->handler_data;
1030 	struct alua_port_group *pg;
1031 
1032 	rcu_read_lock();
1033 	pg = rcu_dereference(h->pg);
1034 	if (!pg || !kref_get_unless_zero(&pg->kref)) {
1035 		rcu_read_unlock();
1036 		return;
1037 	}
1038 	rcu_read_unlock();
1039 
1040 	alua_rtpg_queue(pg, sdev, NULL, force);
1041 	kref_put(&pg->kref, release_port_group);
1042 }
1043 
1044 /*
1045  * alua_prep_fn - request callback
1046  *
1047  * Fail I/O to all paths not in state
1048  * active/optimized or active/non-optimized.
1049  */
1050 static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
1051 {
1052 	struct alua_dh_data *h = sdev->handler_data;
1053 	struct alua_port_group *pg;
1054 	unsigned char state = SCSI_ACCESS_STATE_OPTIMAL;
1055 	int ret = BLKPREP_OK;
1056 
1057 	rcu_read_lock();
1058 	pg = rcu_dereference(h->pg);
1059 	if (pg)
1060 		state = pg->state;
1061 	rcu_read_unlock();
1062 	if (state == SCSI_ACCESS_STATE_TRANSITIONING)
1063 		ret = BLKPREP_DEFER;
1064 	else if (state != SCSI_ACCESS_STATE_OPTIMAL &&
1065 		 state != SCSI_ACCESS_STATE_ACTIVE &&
1066 		 state != SCSI_ACCESS_STATE_LBA) {
1067 		ret = BLKPREP_KILL;
1068 		req->rq_flags |= RQF_QUIET;
1069 	}
1070 	return ret;
1071 
1072 }
1073 
1074 static void alua_rescan(struct scsi_device *sdev)
1075 {
1076 	struct alua_dh_data *h = sdev->handler_data;
1077 
1078 	alua_initialize(sdev, h);
1079 }
1080 
1081 /*
1082  * alua_bus_attach - Attach device handler
1083  * @sdev: device to be attached to
1084  */
1085 static int alua_bus_attach(struct scsi_device *sdev)
1086 {
1087 	struct alua_dh_data *h;
1088 	int err, ret = -EINVAL;
1089 
1090 	h = kzalloc(sizeof(*h) , GFP_KERNEL);
1091 	if (!h)
1092 		return -ENOMEM;
1093 	spin_lock_init(&h->pg_lock);
1094 	rcu_assign_pointer(h->pg, NULL);
1095 	h->init_error = SCSI_DH_OK;
1096 	h->sdev = sdev;
1097 	INIT_LIST_HEAD(&h->node);
1098 
1099 	mutex_init(&h->init_mutex);
1100 	err = alua_initialize(sdev, h);
1101 	if (err == SCSI_DH_NOMEM)
1102 		ret = -ENOMEM;
1103 	if (err != SCSI_DH_OK && err != SCSI_DH_DEV_OFFLINED)
1104 		goto failed;
1105 
1106 	sdev->handler_data = h;
1107 	return 0;
1108 failed:
1109 	kfree(h);
1110 	return ret;
1111 }
1112 
1113 /*
1114  * alua_bus_detach - Detach device handler
1115  * @sdev: device to be detached from
1116  */
1117 static void alua_bus_detach(struct scsi_device *sdev)
1118 {
1119 	struct alua_dh_data *h = sdev->handler_data;
1120 	struct alua_port_group *pg;
1121 
1122 	spin_lock(&h->pg_lock);
1123 	pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
1124 	rcu_assign_pointer(h->pg, NULL);
1125 	h->sdev = NULL;
1126 	spin_unlock(&h->pg_lock);
1127 	if (pg) {
1128 		spin_lock_irq(&pg->lock);
1129 		list_del_rcu(&h->node);
1130 		spin_unlock_irq(&pg->lock);
1131 		kref_put(&pg->kref, release_port_group);
1132 	}
1133 	sdev->handler_data = NULL;
1134 	kfree(h);
1135 }
1136 
/* Device handler operations, registered with the SCSI core in alua_init() */
static struct scsi_device_handler alua_dh = {
	.name = ALUA_DH_NAME,
	.module = THIS_MODULE,
	.attach = alua_bus_attach,
	.detach = alua_bus_detach,
	.prep_fn = alua_prep_fn,
	.check_sense = alua_check_sense,
	.activate = alua_activate,
	.rescan = alua_rescan,
	.set_params = alua_set_params,
};
1148 
1149 static int __init alua_init(void)
1150 {
1151 	int r;
1152 
1153 	kaluad_wq = alloc_workqueue("kaluad", WQ_MEM_RECLAIM, 0);
1154 	if (!kaluad_wq) {
1155 		/* Temporary failure, bypass */
1156 		return SCSI_DH_DEV_TEMP_BUSY;
1157 	}
1158 
1159 	r = scsi_register_device_handler(&alua_dh);
1160 	if (r != 0) {
1161 		printk(KERN_ERR "%s: Failed to register scsi device handler",
1162 			ALUA_DH_NAME);
1163 		destroy_workqueue(kaluad_wq);
1164 	}
1165 	return r;
1166 }
1167 
/*
 * alua_exit - module cleanup
 *
 * Unregister the device handler first, then destroy the workqueue
 * created in alua_init().
 */
static void __exit alua_exit(void)
{
	scsi_unregister_device_handler(&alua_dh);
	destroy_workqueue(kaluad_wq);
}
1173 
/* Module entry/exit points and metadata */
module_init(alua_init);
module_exit(alua_exit);

MODULE_DESCRIPTION("DM Multipath ALUA support");
MODULE_AUTHOR("Hannes Reinecke <hare@suse.de>");
MODULE_LICENSE("GPL");
MODULE_VERSION(ALUA_DH_VER);
1181