xref: /openbmc/u-boot/drivers/ddr/altera/sequencer.c (revision 17fdc9167fd8598d49f8edc930a5e5e649bd1299)
1 /*
2  * Copyright Altera Corporation (C) 2012-2015
3  *
4  * SPDX-License-Identifier:    BSD-3-Clause
5  */
6 
7 #include <common.h>
8 #include <asm/io.h>
9 #include <asm/arch/sdram.h>
10 #include "sequencer.h"
11 #include "sequencer_auto.h"
12 #include "sequencer_auto_ac_init.h"
13 #include "sequencer_auto_inst_init.h"
14 #include "sequencer_defines.h"
15 
16 static void scc_mgr_load_dqs_for_write_group(uint32_t write_group);
17 
18 static struct socfpga_sdr_rw_load_manager *sdr_rw_load_mgr_regs =
19 	(struct socfpga_sdr_rw_load_manager *)(SDR_PHYGRP_RWMGRGRP_ADDRESS | 0x800);
20 
21 static struct socfpga_sdr_rw_load_jump_manager *sdr_rw_load_jump_mgr_regs =
22 	(struct socfpga_sdr_rw_load_jump_manager *)(SDR_PHYGRP_RWMGRGRP_ADDRESS | 0xC00);
23 
24 static struct socfpga_sdr_reg_file *sdr_reg_file =
25 	(struct socfpga_sdr_reg_file *)SDR_PHYGRP_REGFILEGRP_ADDRESS;
26 
27 static struct socfpga_sdr_scc_mgr *sdr_scc_mgr =
28 	(struct socfpga_sdr_scc_mgr *)(SDR_PHYGRP_SCCGRP_ADDRESS | 0xe00);
29 
30 static struct socfpga_phy_mgr_cmd *phy_mgr_cmd =
31 	(struct socfpga_phy_mgr_cmd *)SDR_PHYGRP_PHYMGRGRP_ADDRESS;
32 
33 static struct socfpga_phy_mgr_cfg *phy_mgr_cfg =
34 	(struct socfpga_phy_mgr_cfg *)(SDR_PHYGRP_PHYMGRGRP_ADDRESS | 0x40);
35 
36 static struct socfpga_data_mgr *data_mgr =
37 	(struct socfpga_data_mgr *)SDR_PHYGRP_DATAMGRGRP_ADDRESS;
38 
39 #define DELTA_D		1
40 
41 /*
42  * In order to reduce ROM size, most of the selectable calibration steps are
43  * decided at compile time based on the user's calibration mode selection,
44  * as captured by the STATIC_CALIB_STEPS selection below.
45  *
46  * However, to support simulation-time selection of fast simulation mode, where
47  * we skip everything except the bare minimum, we need a few of the steps to
48  * be dynamic.  In those cases, we either use the DYNAMIC_CALIB_STEPS for the
49  * check, which is based on the rtl-supplied value, or we dynamically compute
50  * the value to use based on the dynamically-chosen calibration mode
51  */
52 
53 #define DLEVEL 0
54 #define STATIC_IN_RTL_SIM 0
55 #define STATIC_SKIP_DELAY_LOOPS 0
56 
57 #define STATIC_CALIB_STEPS (STATIC_IN_RTL_SIM | CALIB_SKIP_FULL_TEST | \
58 	STATIC_SKIP_DELAY_LOOPS)
59 
60 /* calibration steps requested by the rtl */
61 uint16_t dyn_calib_steps;
62 
63 /*
64  * To make CALIB_SKIP_DELAY_LOOPS a dynamic conditional option
65  * instead of static, we use boolean logic to select between
66  * non-skip and skip values
67  *
68  * The mask is set to include all bits when not-skipping, but is
69  * zero when skipping
70  */
71 
72 uint16_t skip_delay_mask;	/* mask off bits when skipping/not-skipping */
73 
74 #define SKIP_DELAY_LOOP_VALUE_OR_ZERO(non_skip_value) \
75 	((non_skip_value) & skip_delay_mask)
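/*
 * For example: when delay loops are being skipped, skip_delay_mask is 0,
 * so SKIP_DELAY_LOOP_VALUE_OR_ZERO(0x6A) evaluates to 0 and the delay
 * counters are loaded with zero; when not skipping, the mask is all ones
 * and the macro returns 0x6A unchanged.
 */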
76 
77 struct gbl_type *gbl;
78 struct param_type *param;
79 uint32_t curr_shadow_reg;
80 
81 static uint32_t rw_mgr_mem_calibrate_write_test(uint32_t rank_bgn,
82 	uint32_t write_group, uint32_t use_dm,
83 	uint32_t all_correct, uint32_t *bit_chk, uint32_t all_ranks);
84 
85 static void set_failing_group_stage(uint32_t group, uint32_t stage,
86 	uint32_t substage)
87 {
88 	/*
89 	 * Only set the global stage if there has not been any other
90 	 * failing group
91 	 */
92 	if (gbl->error_stage == CAL_STAGE_NIL)	{
93 		gbl->error_substage = substage;
94 		gbl->error_stage = stage;
95 		gbl->error_group = group;
96 	}
97 }
98 
99 static void reg_file_set_group(uint32_t set_group)
100 {
101 	u32 addr = (u32)&sdr_reg_file->cur_stage;
102 
103 	/* Read the current group and stage */
104 	uint32_t cur_stage_group = readl(addr);
105 
106 	/* Clear the group */
107 	cur_stage_group &= 0x0000FFFF;
108 
109 	/* Set the group */
110 	cur_stage_group |= (set_group << 16);
111 
112 	/* Write the data back */
113 	writel(cur_stage_group, addr);
114 }
115 
116 static void reg_file_set_stage(uint32_t set_stage)
117 {
118 	u32 addr = (u32)&sdr_reg_file->cur_stage;
119 
120 	/* Read the current group and stage */
121 	uint32_t cur_stage_group = readl(addr);
122 
123 	/* Clear the stage and substage */
124 	cur_stage_group &= 0xFFFF0000;
125 
126 	/* Set the stage */
127 	cur_stage_group |= (set_stage & 0x000000FF);
128 
129 	/* Write the data back */
130 	writel(cur_stage_group, addr);
131 }
132 
133 static void reg_file_set_sub_stage(uint32_t set_sub_stage)
134 {
135 	u32 addr = (u32)&sdr_reg_file->cur_stage;
136 
137 	/* Read the current group and stage */
138 	uint32_t cur_stage_group = readl(addr);
139 
140 	/* Clear the substage */
141 	cur_stage_group &= 0xFFFF00FF;
142 
143 	/* Set the sub stage */
144 	cur_stage_group |= ((set_sub_stage << 8) & 0x0000FF00);
145 
146 	/* Write the data back */
147 	writel(cur_stage_group, addr);
148 }
149 
150 static void initialize(void)
151 {
152 	u32 addr = (u32)&phy_mgr_cfg->mux_sel;
153 
154 	debug("%s:%d\n", __func__, __LINE__);
155 	/* USER calibration has control over path to memory */
156 	/*
157 	 * In Hard PHY this is a 2-bit control:
158 	 * 0: AFI Mux Select
159 	 * 1: DDIO Mux Select
160 	 */
161 	writel(0x3, addr);
162 
163 	/* USER memory clock is not yet stable; we begin initialization */
164 	addr = (u32)&phy_mgr_cfg->reset_mem_stbl;
165 	writel(0, addr);
166 
167 	/* USER calibration status all set to zero */
168 	addr = (u32)&phy_mgr_cfg->cal_status;
169 	writel(0, addr);
170 
171 	addr = (u32)&phy_mgr_cfg->cal_debug_info;
172 	writel(0, addr);
173 
174 	if ((dyn_calib_steps & CALIB_SKIP_ALL) != CALIB_SKIP_ALL) {
175 		param->read_correct_mask_vg  = ((uint32_t)1 <<
176 			(RW_MGR_MEM_DQ_PER_READ_DQS /
177 			RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS)) - 1;
178 		param->write_correct_mask_vg = ((uint32_t)1 <<
179 			(RW_MGR_MEM_DQ_PER_READ_DQS /
180 			RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS)) - 1;
181 		param->read_correct_mask     = ((uint32_t)1 <<
182 			RW_MGR_MEM_DQ_PER_READ_DQS) - 1;
183 		param->write_correct_mask    = ((uint32_t)1 <<
184 			RW_MGR_MEM_DQ_PER_WRITE_DQS) - 1;
185 		param->dm_correct_mask       = ((uint32_t)1 <<
186 			(RW_MGR_MEM_DATA_WIDTH / RW_MGR_MEM_DATA_MASK_WIDTH))
187 			- 1;
188 	}
189 }
190 
191 static void set_rank_and_odt_mask(uint32_t rank, uint32_t odt_mode)
192 {
193 	uint32_t odt_mask_0 = 0;
194 	uint32_t odt_mask_1 = 0;
195 	uint32_t cs_and_odt_mask;
196 	uint32_t addr;
197 
198 	if (odt_mode == RW_MGR_ODT_MODE_READ_WRITE) {
199 		if (RW_MGR_MEM_NUMBER_OF_RANKS == 1) {
200 			/*
201 			 * 1 Rank
202 			 * Read: ODT = 0
203 			 * Write: ODT = 1
204 			 */
205 			odt_mask_0 = 0x0;
206 			odt_mask_1 = 0x1;
207 		} else if (RW_MGR_MEM_NUMBER_OF_RANKS == 2) {
208 			/* 2 Ranks */
209 			if (RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM == 1) {
210 				/* - Dual-Slot, Single-Rank
211 				 * (1 chip-select per DIMM)
212 				 * OR
213 				 * - RDIMM, 4 total CS (2 CS per DIMM),
214 				 * which means 2 DIMMs.
215 				 * Since MEM_NUMBER_OF_RANKS is 2, they are
216 				 * both single rank
217 				 * with 2 CS each (special for RDIMM).
218 				 * Read: Turn on ODT on the opposite rank
219 				 * Write: Turn on ODT on all ranks
220 				 */
221 				odt_mask_0 = 0x3 & ~(1 << rank);
222 				odt_mask_1 = 0x3;
223 			} else {
224 				/*
225 				 * USER - Single-Slot, Dual-Rank DIMMs
226 				 * (2 chip-selects per DIMM)
227 				 * USER Read: Turn off ODT on all ranks
228 				 * USER Write: Turn on ODT on active rank
229 				 */
230 				odt_mask_0 = 0x0;
231 				odt_mask_1 = 0x3 & (1 << rank);
232 			}
233 		} else {
234 			/* 4 Ranks
235 			 * Read:
236 			 * ----------+-----------------------+
237 			 *           |                       |
238 			 *           |         ODT           |
239 			 * Read From +-----------------------+
240 			 *   Rank    |  3  |  2  |  1  |  0  |
241 			 * ----------+-----+-----+-----+-----+
242 			 *     0     |  0  |  1  |  0  |  0  |
243 			 *     1     |  1  |  0  |  0  |  0  |
244 			 *     2     |  0  |  0  |  0  |  1  |
245 			 *     3     |  0  |  0  |  1  |  0  |
246 			 * ----------+-----+-----+-----+-----+
247 			 *
248 			 * Write:
249 			 * ----------+-----------------------+
250 			 *           |                       |
251 			 *           |         ODT           |
252 			 * Write To  +-----------------------+
253 			 *   Rank    |  3  |  2  |  1  |  0  |
254 			 * ----------+-----+-----+-----+-----+
255 			 *     0     |  0  |  1  |  0  |  1  |
256 			 *     1     |  1  |  0  |  1  |  0  |
257 			 *     2     |  0  |  1  |  0  |  1  |
258 			 *     3     |  1  |  0  |  1  |  0  |
259 			 * ----------+-----+-----+-----+-----+
260 			 */
261 			switch (rank) {
262 			case 0:
263 				odt_mask_0 = 0x4;
264 				odt_mask_1 = 0x5;
265 				break;
266 			case 1:
267 				odt_mask_0 = 0x8;
268 				odt_mask_1 = 0xA;
269 				break;
270 			case 2:
271 				odt_mask_0 = 0x1;
272 				odt_mask_1 = 0x5;
273 				break;
274 			case 3:
275 				odt_mask_0 = 0x2;
276 				odt_mask_1 = 0xA;
277 				break;
278 			}
279 		}
280 	} else {
281 		odt_mask_0 = 0x0;
282 		odt_mask_1 = 0x0;
283 	}
284 
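	/*
	 * The value written below packs three byte-wide fields: bits [7:0]
	 * are the chip-select mask (active low, only the selected rank's
	 * bit is cleared), bits [15:8] are odt_mask_0 (read ODT) and bits
	 * [23:16] are odt_mask_1 (write ODT). For example, rank 0 with ODT
	 * off yields 0x0000FE.
	 */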
285 	cs_and_odt_mask =
286 		(0xFF & ~(1 << rank)) |
287 		((0xFF & odt_mask_0) << 8) |
288 		((0xFF & odt_mask_1) << 16);
289 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_SET_CS_AND_ODT_MASK_OFFSET;
290 	writel(cs_and_odt_mask, addr);
291 }
292 
293 static void scc_mgr_initialize(void)
294 {
295 	u32 addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_HHP_RFILE_OFFSET;
296 
297 	/*
298 	 * Clear register file for HPS
299 	 * 16 (2^4) is the size of the full register file in the scc mgr:
300 	 *	RFILE_DEPTH = log2(MEM_DQ_PER_DQS + 1 + MEM_DM_PER_DQS +
301 	 * MEM_IF_READ_DQS_WIDTH - 1) + 1;
302 	 */
303 	uint32_t i;
304 	for (i = 0; i < 16; i++) {
305 		debug_cond(DLEVEL == 1, "%s:%d: Clearing SCC RFILE index %u\n",
306 			   __func__, __LINE__, i);
307 		writel(0, addr + (i << 2));
308 	}
309 }
310 
311 static void scc_mgr_set_dqs_bus_in_delay(uint32_t read_group,
312 						uint32_t delay)
313 {
314 	u32 addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_DQS_IN_DELAY_OFFSET;
315 
316 	/* Load the setting in the SCC manager */
317 	writel(delay, addr + (read_group << 2));
318 }
319 
320 static void scc_mgr_set_dqs_io_in_delay(uint32_t write_group,
321 	uint32_t delay)
322 {
323 	u32 addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_IO_IN_DELAY_OFFSET;
324 
325 	writel(delay, addr + (RW_MGR_MEM_DQ_PER_WRITE_DQS << 2));
326 }
327 
328 static void scc_mgr_set_dqs_en_phase(uint32_t read_group, uint32_t phase)
329 {
330 	u32 addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_DQS_EN_PHASE_OFFSET;
331 
332 	/* Load the setting in the SCC manager */
333 	writel(phase, addr + (read_group << 2));
334 }
335 
336 static void scc_mgr_set_dqs_en_phase_all_ranks(uint32_t read_group,
337 					       uint32_t phase)
338 {
339 	uint32_t r;
340 	uint32_t update_scan_chains;
341 	uint32_t addr;
342 
343 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
344 	     r += NUM_RANKS_PER_SHADOW_REG) {
345 		/*
346 		 * USER Although the h/w doesn't support different phases per
347 		 * shadow register, for simplicity our scc manager modeling
348 		 * keeps different phase settings per shadow reg, and it's
349 		 * important for us to keep them in sync to match h/w.
350 		 * For efficiency, the scan chain update should occur only
351 		 * once, to sr0.
352 		 */
353 		update_scan_chains = (r == 0) ? 1 : 0;
354 
355 		scc_mgr_set_dqs_en_phase(read_group, phase);
356 
357 		if (update_scan_chains) {
358 			addr = (u32)&sdr_scc_mgr->dqs_ena;
359 			writel(read_group, addr);
360 
361 			addr = (u32)&sdr_scc_mgr->update;
362 			writel(0, addr);
363 		}
364 	}
365 }
366 
367 static void scc_mgr_set_dqdqs_output_phase(uint32_t write_group,
368 						  uint32_t phase)
369 {
370 	u32 addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_DQDQS_OUT_PHASE_OFFSET;
371 
372 	/* Load the setting in the SCC manager */
373 	writel(phase, addr + (write_group << 2));
374 }
375 
376 static void scc_mgr_set_dqdqs_output_phase_all_ranks(uint32_t write_group,
377 						     uint32_t phase)
378 {
379 	uint32_t r;
380 	uint32_t update_scan_chains;
381 	uint32_t addr;
382 
383 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
384 	     r += NUM_RANKS_PER_SHADOW_REG) {
385 		/*
386 		 * USER Although the h/w doesn't support different phases per
387 		 * shadow register, for simplicity our scc manager modeling
388 		 * keeps different phase settings per shadow reg, and it's
389 		 * important for us to keep them in sync to match h/w.
390 		 * For efficiency, the scan chain update should occur only
391 		 * once, to sr0.
392 		 */
393 		update_scan_chains = (r == 0) ? 1 : 0;
394 
395 		scc_mgr_set_dqdqs_output_phase(write_group, phase);
396 
397 		if (update_scan_chains) {
398 			addr = (u32)&sdr_scc_mgr->dqs_ena;
399 			writel(write_group, addr);
400 
401 			addr = (u32)&sdr_scc_mgr->update;
402 			writel(0, addr);
403 		}
404 	}
405 }
406 
407 static void scc_mgr_set_dqs_en_delay(uint32_t read_group, uint32_t delay)
408 {
409 	uint32_t addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_DQS_EN_DELAY_OFFSET;
410 
411 	/* Load the setting in the SCC manager */
412 	writel(delay + IO_DQS_EN_DELAY_OFFSET, addr +
413 	       (read_group << 2));
414 }
415 
416 static void scc_mgr_set_dqs_en_delay_all_ranks(uint32_t read_group,
417 					       uint32_t delay)
418 {
419 	uint32_t r;
420 	uint32_t addr;
421 
422 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
423 		r += NUM_RANKS_PER_SHADOW_REG) {
424 		scc_mgr_set_dqs_en_delay(read_group, delay);
425 
426 		addr = (u32)&sdr_scc_mgr->dqs_ena;
427 		writel(read_group, addr);
428 		/*
429 		 * In shadow register mode, the T11 settings are stored in
430 		 * registers in the core, which are updated by the DQS_ENA
431 		 * signals. Not issuing the SCC_MGR_UPD command allows us to
432 		 * save lots of rank switching overhead, by calling
433 		 * select_shadow_regs_for_update with update_scan_chains
434 		 * set to 0.
435 		 */
436 		addr = (u32)&sdr_scc_mgr->update;
437 		writel(0, addr);
438 	}
439 	/*
440 	 * In shadow register mode, the T11 settings are stored in
441 	 * registers in the core, which are updated by the DQS_ENA
442 	 * signals. Not issuing the SCC_MGR_UPD command allows us to
443 	 * save lots of rank switching overhead, by calling
444 	 * select_shadow_regs_for_update with update_scan_chains
445 	 * set to 0.
446 	 */
447 	addr = (u32)&sdr_scc_mgr->update;
448 	writel(0, addr);
449 }
450 
451 static void scc_mgr_set_oct_out1_delay(uint32_t write_group, uint32_t delay)
452 {
453 	uint32_t read_group;
454 	uint32_t addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_OCT_OUT1_DELAY_OFFSET;
455 
456 	/*
457 	 * Load the setting in the SCC manager
458 	 * Although OCT affects only write data, the OCT delay is controlled
459 	 * by the DQS logic block which is instantiated once per read group.
460 	 * For protocols where a write group consists of multiple read groups,
461 	 * the setting must be set multiple times.
462 	 */
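	/*
	 * For example, if RW_MGR_MEM_IF_READ_DQS_WIDTH were 8 and
	 * RW_MGR_MEM_IF_WRITE_DQS_WIDTH were 4 (both are interface-specific
	 * values), each write group would span two read groups, so
	 * write_group 1 would be scanned into read groups 2 and 3 below.
	 */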
463 	for (read_group = write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH /
464 	     RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
465 	     read_group < (write_group + 1) * RW_MGR_MEM_IF_READ_DQS_WIDTH /
466 	     RW_MGR_MEM_IF_WRITE_DQS_WIDTH; ++read_group)
467 		writel(delay, addr + (read_group << 2));
468 }
469 
470 static void scc_mgr_set_dq_out1_delay(uint32_t write_group,
471 				      uint32_t dq_in_group, uint32_t delay)
472 {
473 	uint32_t addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_IO_OUT1_DELAY_OFFSET;
474 
475 	/* Load the setting in the SCC manager */
476 	writel(delay, addr + (dq_in_group << 2));
477 }
478 
479 static void scc_mgr_set_dq_in_delay(uint32_t write_group,
480 	uint32_t dq_in_group, uint32_t delay)
481 {
482 	uint32_t addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_IO_IN_DELAY_OFFSET;
483 
484 	/* Load the setting in the SCC manager */
485 	writel(delay, addr + (dq_in_group << 2));
486 }
487 
488 static void scc_mgr_set_hhp_extras(void)
489 {
490 	/*
491 	 * Load the fixed setting in the SCC manager
492 	 * bits: 0:0 = 1'b1   - dqs bypass
493 	 * bits: 1:1 = 1'b1   - dq bypass
494 	 * bits: 4:2 = 3'b001   - rfifo_mode
495 	 * bits: 6:5 = 2'b01  - rfifo clock_select
496 	 * bits: 7:7 = 1'b0  - separate gating from ungating setting
497 	 * bits: 8:8 = 1'b0  - separate OE from Output delay setting
498 	 */
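	/* With the field values above, this works out to 0x27. */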
499 	uint32_t value = (0<<8) | (0<<7) | (1<<5) | (1<<2) | (1<<1) | (1<<0);
500 	uint32_t addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_HHP_GLOBALS_OFFSET;
501 
502 	writel(value, addr + SCC_MGR_HHP_EXTRAS_OFFSET);
503 }
504 
505 static void scc_mgr_set_dqs_out1_delay(uint32_t write_group,
506 					      uint32_t delay)
507 {
508 	uint32_t addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_IO_OUT1_DELAY_OFFSET;
509 
510 	/* Load the setting in the SCC manager */
511 	writel(delay, addr + (RW_MGR_MEM_DQ_PER_WRITE_DQS << 2));
512 }
513 
514 static void scc_mgr_set_dm_out1_delay(uint32_t write_group,
515 					     uint32_t dm, uint32_t delay)
516 {
517 	uint32_t addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_IO_OUT1_DELAY_OFFSET;
518 
519 	/* Load the setting in the SCC manager */
520 	writel(delay, addr +
521 		((RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + dm) << 2));
522 }
523 
524 /*
525  * USER Zero all DQS config
526  * TODO: maybe rename to scc_mgr_zero_dqs_config (or something)
527  */
528 static void scc_mgr_zero_all(void)
529 {
530 	uint32_t i, r;
531 	uint32_t addr;
532 
533 	/*
534 	 * USER Zero all DQS config settings, across all groups and all
535 	 * shadow registers
536 	 */
537 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r +=
538 	     NUM_RANKS_PER_SHADOW_REG) {
539 		for (i = 0; i < RW_MGR_MEM_IF_READ_DQS_WIDTH; i++) {
540 			/*
541 			 * The phases actually don't exist on a per-rank basis,
542 			 * but there's no harm updating them several times, so
543 			 * let's keep the code simple.
544 			 */
545 			scc_mgr_set_dqs_bus_in_delay(i, IO_DQS_IN_RESERVE);
546 			scc_mgr_set_dqs_en_phase(i, 0);
547 			scc_mgr_set_dqs_en_delay(i, 0);
548 		}
549 
550 		for (i = 0; i < RW_MGR_MEM_IF_WRITE_DQS_WIDTH; i++) {
551 			scc_mgr_set_dqdqs_output_phase(i, 0);
552 			/* av/cv don't have out2 */
553 			scc_mgr_set_oct_out1_delay(i, IO_DQS_OUT_RESERVE);
554 		}
555 	}
556 
557 	/* multicast to all DQS group enables */
558 	addr = (u32)&sdr_scc_mgr->dqs_ena;
559 	writel(0xff, addr);
560 
561 	addr = (u32)&sdr_scc_mgr->update;
562 	writel(0, addr);
563 }
564 
565 static void scc_set_bypass_mode(uint32_t write_group, uint32_t mode)
566 {
567 	uint32_t addr;
568 	/* mode = 0 : Do NOT bypass - Half Rate Mode */
569 	/* mode = 1 : Bypass - Full Rate Mode */
570 
571 	/* only need to set once for all groups, pins, dq, dqs, dm */
572 	if (write_group == 0) {
573 		debug_cond(DLEVEL == 1, "%s:%d Setting HHP Extras\n", __func__,
574 			   __LINE__);
575 		scc_mgr_set_hhp_extras();
576 		debug_cond(DLEVEL == 1, "%s:%d Done Setting HHP Extras\n",
577 			  __func__, __LINE__);
578 	}
579 	/* multicast to all DQ enables */
580 	addr = (u32)&sdr_scc_mgr->dq_ena;
581 	writel(0xff, addr);
582 
583 	addr = (u32)&sdr_scc_mgr->dm_ena;
584 	writel(0xff, addr);
585 
586 	/* update current DQS IO enable */
587 	addr = (u32)&sdr_scc_mgr->dqs_io_ena;
588 	writel(0, addr);
589 
590 	/* update the DQS logic */
591 	addr = (u32)&sdr_scc_mgr->dqs_ena;
592 	writel(write_group, addr);
593 
594 	/* hit update */
595 	addr = (u32)&sdr_scc_mgr->update;
596 	writel(0, addr);
597 }
598 
599 static void scc_mgr_zero_group(uint32_t write_group, uint32_t test_begin,
600 			       int32_t out_only)
601 {
602 	uint32_t i, r;
603 	uint32_t addr;
604 
605 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r +=
606 		NUM_RANKS_PER_SHADOW_REG) {
607 		/* Zero all DQ config settings */
608 		for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
609 			scc_mgr_set_dq_out1_delay(write_group, i, 0);
610 			if (!out_only)
611 				scc_mgr_set_dq_in_delay(write_group, i, 0);
612 		}
613 
614 		/* multicast to all DQ enables */
615 		addr = (u32)&sdr_scc_mgr->dq_ena;
616 		writel(0xff, addr);
617 
618 		/* Zero all DM config settings */
619 		for (i = 0; i < RW_MGR_NUM_DM_PER_WRITE_GROUP; i++) {
620 			scc_mgr_set_dm_out1_delay(write_group, i, 0);
621 		}
622 
623 		/* multicast to all DM enables */
624 		addr = (u32)&sdr_scc_mgr->dm_ena;
625 		writel(0xff, addr);
626 
627 		/* zero all DQS io settings */
628 		if (!out_only)
629 			scc_mgr_set_dqs_io_in_delay(write_group, 0);
630 		/* av/cv don't have out2 */
631 		scc_mgr_set_dqs_out1_delay(write_group, IO_DQS_OUT_RESERVE);
632 		scc_mgr_set_oct_out1_delay(write_group, IO_DQS_OUT_RESERVE);
633 		scc_mgr_load_dqs_for_write_group(write_group);
634 
635 		/* multicast to all DQS IO enables (only 1) */
636 		addr = (u32)&sdr_scc_mgr->dqs_io_ena;
637 		writel(0, addr);
638 
639 		/* hit update to zero everything */
640 		addr = (u32)&sdr_scc_mgr->update;
641 		writel(0, addr);
642 	}
643 }
644 
645 /* load up dqs config settings */
646 static void scc_mgr_load_dqs(uint32_t dqs)
647 {
648 	uint32_t addr = (u32)&sdr_scc_mgr->dqs_ena;
649 
650 	writel(dqs, addr);
651 }
652 
653 static void scc_mgr_load_dqs_for_write_group(uint32_t write_group)
654 {
655 	uint32_t read_group;
656 	uint32_t addr = (u32)&sdr_scc_mgr->dqs_ena;
657 	/*
658 	 * Although OCT affects only write data, the OCT delay is controlled
659 	 * by the DQS logic block which is instantiated once per read group.
660 	 * For protocols where a write group consists of multiple read groups,
661 	 * the setting must be scanned multiple times.
662 	 */
663 	for (read_group = write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH /
664 	     RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
665 	     read_group < (write_group + 1) * RW_MGR_MEM_IF_READ_DQS_WIDTH /
666 	     RW_MGR_MEM_IF_WRITE_DQS_WIDTH; ++read_group)
667 		writel(read_group, addr);
668 }
669 
670 /* load up dqs io config settings */
671 static void scc_mgr_load_dqs_io(void)
672 {
673 	uint32_t addr = (u32)&sdr_scc_mgr->dqs_io_ena;
674 
675 	writel(0, addr);
676 }
677 
678 /* load up dq config settings */
679 static void scc_mgr_load_dq(uint32_t dq_in_group)
680 {
681 	uint32_t addr = (u32)&sdr_scc_mgr->dq_ena;
682 
683 	writel(dq_in_group, addr);
684 }
685 
686 /* load up dm config settings */
687 static void scc_mgr_load_dm(uint32_t dm)
688 {
689 	uint32_t addr = (u32)&sdr_scc_mgr->dm_ena;
690 
691 	writel(dm, addr);
692 }
693 
694 /*
695  * apply and load a particular input delay for the DQ pins in a group
696  * group_bgn is the index of the first dq pin (in the write group)
697  */
698 static void scc_mgr_apply_group_dq_in_delay(uint32_t write_group,
699 					    uint32_t group_bgn, uint32_t delay)
700 {
701 	uint32_t i, p;
702 
703 	for (i = 0, p = group_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++, p++) {
704 		scc_mgr_set_dq_in_delay(write_group, p, delay);
705 		scc_mgr_load_dq(p);
706 	}
707 }
708 
709 /* apply and load a particular output delay for the DQ pins in a group */
710 static void scc_mgr_apply_group_dq_out1_delay(uint32_t write_group,
711 					      uint32_t group_bgn,
712 					      uint32_t delay1)
713 {
714 	uint32_t i, p;
715 
716 	for (i = 0, p = group_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
717 		scc_mgr_set_dq_out1_delay(write_group, i, delay1);
718 		scc_mgr_load_dq(i);
719 	}
720 }
721 
722 /* apply and load a particular output delay for the DM pins in a group */
723 static void scc_mgr_apply_group_dm_out1_delay(uint32_t write_group,
724 					      uint32_t delay1)
725 {
726 	uint32_t i;
727 
728 	for (i = 0; i < RW_MGR_NUM_DM_PER_WRITE_GROUP; i++) {
729 		scc_mgr_set_dm_out1_delay(write_group, i, delay1);
730 		scc_mgr_load_dm(i);
731 	}
732 }
733 
734 
735 /* apply and load delay on both DQS and OCT out1 */
736 static void scc_mgr_apply_group_dqs_io_and_oct_out1(uint32_t write_group,
737 						    uint32_t delay)
738 {
739 	scc_mgr_set_dqs_out1_delay(write_group, delay);
740 	scc_mgr_load_dqs_io();
741 
742 	scc_mgr_set_oct_out1_delay(write_group, delay);
743 	scc_mgr_load_dqs_for_write_group(write_group);
744 }
745 
746 /* apply a delay to the entire output side: DQ, DM, DQS, OCT */
747 static void scc_mgr_apply_group_all_out_delay_add(uint32_t write_group,
748 						  uint32_t group_bgn,
749 						  uint32_t delay)
750 {
751 	uint32_t i, p, new_delay;
752 
753 	/* dq shift */
754 	for (i = 0, p = group_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
755 		new_delay = READ_SCC_DQ_OUT2_DELAY;
756 		new_delay += delay;
757 
758 		if (new_delay > IO_IO_OUT2_DELAY_MAX) {
759 			debug_cond(DLEVEL == 1, "%s:%d (%u, %u, %u) DQ[%u,%u]:\
760 				   %u > %lu => %lu", __func__, __LINE__,
761 				   write_group, group_bgn, delay, i, p, new_delay,
762 				   (long unsigned int)IO_IO_OUT2_DELAY_MAX,
763 				   (long unsigned int)IO_IO_OUT2_DELAY_MAX);
764 			new_delay = IO_IO_OUT2_DELAY_MAX;
765 		}
766 
767 		scc_mgr_load_dq(i);
768 	}
769 
770 	/* dm shift */
771 	for (i = 0; i < RW_MGR_NUM_DM_PER_WRITE_GROUP; i++) {
772 		new_delay = READ_SCC_DM_IO_OUT2_DELAY;
773 		new_delay += delay;
774 
775 		if (new_delay > IO_IO_OUT2_DELAY_MAX) {
776 			debug_cond(DLEVEL == 1, "%s:%d (%u, %u, %u) DM[%u]:\
777 				   %u > %lu => %lu\n",  __func__, __LINE__,
778 				   write_group, group_bgn, delay, i, new_delay,
779 				   (long unsigned int)IO_IO_OUT2_DELAY_MAX,
780 				   (long unsigned int)IO_IO_OUT2_DELAY_MAX);
781 			new_delay = IO_IO_OUT2_DELAY_MAX;
782 		}
783 
784 		scc_mgr_load_dm(i);
785 	}
786 
787 	/* dqs shift */
788 	new_delay = READ_SCC_DQS_IO_OUT2_DELAY;
789 	new_delay += delay;
790 
791 	if (new_delay > IO_IO_OUT2_DELAY_MAX) {
792 		debug_cond(DLEVEL == 1, "%s:%d (%u, %u, %u) DQS: %u > %d => %d;"
793 			   " adding %u to OUT1\n", __func__, __LINE__,
794 			   write_group, group_bgn, delay, new_delay,
795 			   IO_IO_OUT2_DELAY_MAX, IO_IO_OUT2_DELAY_MAX,
796 			   new_delay - IO_IO_OUT2_DELAY_MAX);
797 		scc_mgr_set_dqs_out1_delay(write_group, new_delay -
798 					   IO_IO_OUT2_DELAY_MAX);
799 		new_delay = IO_IO_OUT2_DELAY_MAX;
800 	}
801 
802 	scc_mgr_load_dqs_io();
803 
804 	/* oct shift */
805 	new_delay = READ_SCC_OCT_OUT2_DELAY;
806 	new_delay += delay;
807 
808 	if (new_delay > IO_IO_OUT2_DELAY_MAX) {
809 		debug_cond(DLEVEL == 1, "%s:%d (%u, %u, %u) DQS: %u > %d => %d;"
810 			   " adding %u to OUT1\n", __func__, __LINE__,
811 			   write_group, group_bgn, delay, new_delay,
812 			   IO_IO_OUT2_DELAY_MAX, IO_IO_OUT2_DELAY_MAX,
813 			   new_delay - IO_IO_OUT2_DELAY_MAX);
814 		scc_mgr_set_oct_out1_delay(write_group, new_delay -
815 					   IO_IO_OUT2_DELAY_MAX);
816 		new_delay = IO_IO_OUT2_DELAY_MAX;
817 	}
818 
819 	scc_mgr_load_dqs_for_write_group(write_group);
820 }
821 
822 /*
823  * USER apply a delay to the entire output side (DQ, DM, DQS, OCT)
824  * and to all ranks
825  */
826 static void scc_mgr_apply_group_all_out_delay_add_all_ranks(
827 	uint32_t write_group, uint32_t group_bgn, uint32_t delay)
828 {
829 	uint32_t r;
830 	uint32_t addr = (u32)&sdr_scc_mgr->update;
831 
832 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
833 		r += NUM_RANKS_PER_SHADOW_REG) {
834 		scc_mgr_apply_group_all_out_delay_add(write_group,
835 						      group_bgn, delay);
836 		writel(0, addr);
837 	}
838 }
839 
840 /* optimization used to recover some slots in ddr3 inst_rom */
841 /* could be applied to other protocols if we wanted to */
842 static void set_jump_as_return(void)
843 {
844 	uint32_t addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
845 
846 	/*
847 	 * To save space, we replace return with a jump to a special, shared
848 	 * RETURN instruction, and we set the counter to a large value so
849 	 * that we always jump.
850 	 */
851 	writel(0xff, addr);
852 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
853 	writel(RW_MGR_RETURN, addr);
854 }
855 
856 /*
857  * should always use constants as argument to ensure all computations are
858  * performed at compile time
859  */
860 static void delay_for_n_mem_clocks(const uint32_t clocks)
861 {
862 	uint32_t afi_clocks;
863 	uint8_t inner = 0;
864 	uint8_t outer = 0;
865 	uint16_t c_loop = 0;
866 	uint32_t addr;
867 
868 	debug("%s:%d: clocks=%u ... start\n", __func__, __LINE__, clocks);
869 
870 
871 	afi_clocks = (clocks + AFI_RATE_RATIO-1) / AFI_RATE_RATIO;
872 	/* scale (rounding up) to get afi clocks */
873 
874 	/*
875 	 * Note: we don't bother accounting for being off a little bit
876 	 * because of a few extra instructions in the outer loops.
877 	 * Note: the loops have a test at the end and perform the test
878 	 * before the decrement, so they always execute the loop
879 	 * one more time than the counter value.
880 	 */
881 	if (afi_clocks == 0) {
882 		;
883 	} else if (afi_clocks <= 0x100) {
884 		inner = afi_clocks-1;
885 		outer = 0;
886 		c_loop = 0;
887 	} else if (afi_clocks <= 0x10000) {
888 		inner = 0xff;
889 		outer = (afi_clocks-1) >> 8;
890 		c_loop = 0;
891 	} else {
892 		inner = 0xff;
893 		outer = 0xff;
894 		c_loop = (afi_clocks-1) >> 16;
895 	}
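	/*
	 * For example, with AFI_RATE_RATIO == 2 (a configuration-dependent
	 * value), clocks == 512 gives afi_clocks == 256, which still fits
	 * the single-loop case: inner == 0xff, outer == 0, c_loop == 0, and
	 * the inner loop runs 256 times.
	 */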
896 
897 	/*
898 	 * rom instructions are structured as follows:
899 	 *
900 	 *    IDLE_LOOP2: jnz cntr0, TARGET_A
901 	 *    IDLE_LOOP1: jnz cntr1, TARGET_B
902 	 *                return
903 	 *
904 	 * so, when doing nested loops, TARGET_A is set to IDLE_LOOP2, and
905 	 * TARGET_B is set to IDLE_LOOP2 as well
906 	 *
907 	 * if we have no outer loop, though, then we can use IDLE_LOOP1 only,
908 	 * and set TARGET_B to IDLE_LOOP1 and we skip IDLE_LOOP2 entirely
909 	 *
910 	 * a little confusing, but it helps save precious space in the inst_rom
911 	 * and sequencer rom and keeps the delays more accurate and reduces
912 	 * overhead
913 	 */
914 	if (afi_clocks <= 0x100) {
915 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
916 		writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(inner), addr);
917 
918 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
919 		writel(RW_MGR_IDLE_LOOP1, addr);
920 
921 		addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
922 		writel(RW_MGR_IDLE_LOOP1, addr);
923 	} else {
924 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
925 		writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(inner), addr);
926 
927 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
928 		writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(outer), addr);
929 
930 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
931 		writel(RW_MGR_IDLE_LOOP2, addr);
932 
933 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
934 		writel(RW_MGR_IDLE_LOOP2, addr);
935 
936 		/* hack to get around compiler not being smart enough */
937 		if (afi_clocks <= 0x10000) {
938 			/* only need to run once */
939 			addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
940 			writel(RW_MGR_IDLE_LOOP2, addr);
941 		} else {
942 			do {
943 				addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
944 				writel(RW_MGR_IDLE_LOOP2, addr);
945 			} while (c_loop-- != 0);
946 		}
947 	}
948 	debug("%s:%d clocks=%u ... end\n", __func__, __LINE__, clocks);
949 }
950 
951 static void rw_mgr_mem_initialize(void)
952 {
953 	uint32_t r;
954 	uint32_t addr;
955 
956 	debug("%s:%d\n", __func__, __LINE__);
957 
958 	/* The reset / CKE part of initialization is broadcast to all ranks */
959 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_SET_CS_AND_ODT_MASK_OFFSET;
960 	writel(RW_MGR_RANK_ALL, addr);
961 
962 	/*
963 	 * Here's how to load the registers for a loop:
964 	 * Counters are located @ 0x800
965 	 * Jump addresses are located @ 0xC00
966 	 * For both, registers 0 to 3 are selected using bits 3 and 2, like
967 	 * in 0x800, 0x804, 0x808, 0x80C and 0xC00, 0xC04, 0xC08, 0xC0C.
968 	 * I know this ain't pretty, but the Avalon bus throws away the 2
969 	 * least significant bits.
970 	 */
971 
972 	/* start with memory RESET activated */
973 
974 	/* tINIT = 200us */
975 
976 	/*
977 	 * 200us @ 266MHz (3.75 ns) ~ 54000 clock cycles
978 	 * If a and b are the number of iterations in 2 nested loops,
979 	 * it takes the following number of cycles to complete the operation:
980 	 * number_of_cycles = ((2 + n) * a + 2) * b
981 	 * where n is the number of instructions in the inner loop.
982 	 * One possible solution is n = 0, a = 256, b = 106 => a = FF,
983 	 * b = 6A
984 	 */
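	/*
	 * Checking the arithmetic: 200us / 3.75ns = 53,334 cycles, and with
	 * n = 0, a = 256, b = 106 the formula gives
	 * ((2 + 0) * 256 + 2) * 106 = 54,484 cycles, which covers tINIT.
	 */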
985 
986 	/* Load counters */
987 	addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
988 	writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR0_VAL),
989 	       addr);
990 	addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
991 	writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR1_VAL),
992 	       addr);
993 	addr = (u32)&sdr_rw_load_mgr_regs->load_cntr2;
994 	writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR2_VAL),
995 	       addr);
996 
997 	/* Load jump address */
998 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
999 	writel(RW_MGR_INIT_RESET_0_CKE_0, addr);
1000 
1001 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
1002 	writel(RW_MGR_INIT_RESET_0_CKE_0, addr);
1003 
1004 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
1005 	writel(RW_MGR_INIT_RESET_0_CKE_0, addr);
1006 
1007 	/* Execute count instruction */
1008 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1009 	writel(RW_MGR_INIT_RESET_0_CKE_0, addr);
1010 
1011 	/* indicate that memory is stable */
1012 	addr = (u32)&phy_mgr_cfg->reset_mem_stbl;
1013 	writel(1, addr);
1014 
1015 	/*
1016 	 * transition the RESET to high
1017 	 * Wait for 500us
1018 	 */
1019 
1020 	/*
1021 	 * 500us @ 266MHz (3.75 ns) ~ 134000 clock cycles
1022 	 * If a and b are the number of iterations in 2 nested loops,
1023 	 * it takes the following number of cycles to complete the operation:
1024 	 * number_of_cycles = ((2 + n) * a + 2) * b
1025 	 * where n is the number of instructions in the inner loop.
1026 	 * One possible solution is n = 2, a = 131, b = 256 => a = 83,
1027 	 * b = FF
1028 	 */
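	/*
	 * Checking the arithmetic: 500us / 3.75ns = 133,334 cycles, and with
	 * n = 2, a = 131, b = 256 the formula gives
	 * ((2 + 2) * 131 + 2) * 256 = 134,656 cycles, which covers the wait.
	 */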
1029 
1030 	/* Load counters */
1031 	addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
1032 	writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR0_VAL),
1033 	       addr);
1034 	addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
1035 	writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR1_VAL),
1036 	       addr);
1037 	addr = (u32)&sdr_rw_load_mgr_regs->load_cntr2;
1038 	writel(SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR2_VAL),
1039 	       addr);
1040 
1041 	/* Load jump address */
1042 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
1043 	writel(RW_MGR_INIT_RESET_1_CKE_0, addr);
1044 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
1045 	writel(RW_MGR_INIT_RESET_1_CKE_0, addr);
1046 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
1047 	writel(RW_MGR_INIT_RESET_1_CKE_0, addr);
1048 
1049 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1050 	writel(RW_MGR_INIT_RESET_1_CKE_0, addr);
1051 
1052 	/* bring up clock enable */
1053 
1054 	/* tXRP < 250 ck cycles */
1055 	delay_for_n_mem_clocks(250);
1056 
1057 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
1058 		if (param->skip_ranks[r]) {
1059 			/* request to skip the rank */
1060 			continue;
1061 		}
1062 
1063 		/* set rank */
1064 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
1065 
1066 		/*
1067 		 * USER Use mirrored commands for odd ranks if address
1068 		 * mirroring is on
1069 		 */
1070 		if ((RW_MGR_MEM_ADDRESS_MIRRORING >> r) & 0x1) {
1071 			set_jump_as_return();
1072 			addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1073 			writel(RW_MGR_MRS2_MIRR, addr);
1074 			delay_for_n_mem_clocks(4);
1075 			set_jump_as_return();
1076 			writel(RW_MGR_MRS3_MIRR, addr);
1077 			delay_for_n_mem_clocks(4);
1078 			set_jump_as_return();
1079 			writel(RW_MGR_MRS1_MIRR, addr);
1080 			delay_for_n_mem_clocks(4);
1081 			set_jump_as_return();
1082 			writel(RW_MGR_MRS0_DLL_RESET_MIRR, addr);
1083 		} else {
1084 			set_jump_as_return();
1085 			addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1086 			writel(RW_MGR_MRS2, addr);
1087 			delay_for_n_mem_clocks(4);
1088 			set_jump_as_return();
1089 			writel(RW_MGR_MRS3, addr);
1090 			delay_for_n_mem_clocks(4);
1091 			set_jump_as_return();
1092 			writel(RW_MGR_MRS1, addr);
1093 			set_jump_as_return();
1094 			writel(RW_MGR_MRS0_DLL_RESET, addr);
1095 		}
1096 		set_jump_as_return();
1097 		addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1098 		writel(RW_MGR_ZQCL, addr);
1099 
1100 		/* tZQinit = tDLLK = 512 ck cycles */
1101 		delay_for_n_mem_clocks(512);
1102 	}
1103 }
1104 
1105 /*
1106  * At the end of calibration we have to program the user settings in and
1107  * USER hand off the memory to the user.
1108  */
1109 static void rw_mgr_mem_handoff(void)
1110 {
1111 	uint32_t r;
1112 	uint32_t addr;
1113 
1114 	debug("%s:%d\n", __func__, __LINE__);
1115 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
1116 		if (param->skip_ranks[r])
1117 			/* request to skip the rank */
1118 			continue;
1119 		/* set rank */
1120 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
1121 
1122 		/* precharge all banks ... */
1123 		addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1124 		writel(RW_MGR_PRECHARGE_ALL, addr);
1125 
1126 		/* load up MR settings specified by user */
1127 
1128 		/*
1129 		 * Use mirrored commands for odd ranks if address
1130 		 * mirroring is on
1131 		 */
1132 		addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1133 		if ((RW_MGR_MEM_ADDRESS_MIRRORING >> r) & 0x1) {
1134 			set_jump_as_return();
1135 			writel(RW_MGR_MRS2_MIRR, addr);
1136 			delay_for_n_mem_clocks(4);
1137 			set_jump_as_return();
1138 			writel(RW_MGR_MRS3_MIRR, addr);
1139 			delay_for_n_mem_clocks(4);
1140 			set_jump_as_return();
1141 			writel(RW_MGR_MRS1_MIRR, addr);
1142 			delay_for_n_mem_clocks(4);
1143 			set_jump_as_return();
1144 			writel(RW_MGR_MRS0_USER_MIRR, addr);
1145 		} else {
1146 			set_jump_as_return();
1147 			writel(RW_MGR_MRS2, addr);
1148 			delay_for_n_mem_clocks(4);
1149 			set_jump_as_return();
1150 			writel(RW_MGR_MRS3, addr);
1151 			delay_for_n_mem_clocks(4);
1152 			set_jump_as_return();
1153 			writel(RW_MGR_MRS1, addr);
1154 			delay_for_n_mem_clocks(4);
1155 			set_jump_as_return();
1156 			writel(RW_MGR_MRS0_USER, addr);
1157 		}
1158 		/*
1159 		 * USER We need to wait tMOD (12CK or 15ns) before issuing
1160 		 * other commands, but we will have plenty of NIOS cycles before
1161 		 * the actual handoff, so it's okay.
1162 		 */
1163 	}
1164 }
1165 
1166 /*
1167  * performs a guaranteed read on the patterns we are going to use during a
1168  * read test to ensure memory works
1169  */
1170 static uint32_t rw_mgr_mem_calibrate_read_test_patterns(uint32_t rank_bgn,
1171 	uint32_t group, uint32_t num_tries, uint32_t *bit_chk,
1172 	uint32_t all_ranks)
1173 {
1174 	uint32_t r, vg;
1175 	uint32_t correct_mask_vg;
1176 	uint32_t tmp_bit_chk;
1177 	uint32_t rank_end = all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS :
1178 		(rank_bgn + NUM_RANKS_PER_SHADOW_REG);
1179 	uint32_t addr;
1180 	uint32_t base_rw_mgr;
1181 
1182 	*bit_chk = param->read_correct_mask;
1183 	correct_mask_vg = param->read_correct_mask_vg;
1184 
1185 	for (r = rank_bgn; r < rank_end; r++) {
1186 		if (param->skip_ranks[r])
1187 			/* request to skip the rank */
1188 			continue;
1189 
1190 		/* set rank */
1191 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_READ_WRITE);
1192 
1193 		/* Load up a constant burst of read commands */
1194 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
1195 		writel(0x20, addr);
1196 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
1197 		writel(RW_MGR_GUARANTEED_READ, addr);
1198 
1199 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
1200 		writel(0x20, addr);
1201 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
1202 		writel(RW_MGR_GUARANTEED_READ_CONT, addr);
1203 
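		/*
		 * Accumulate the per-virtual-group results: each pass of the
		 * loop below shifts the previous bits up and ORs in the bits
		 * that read back correctly (a cleared status bit from the RW
		 * manager means that DQ passed).
		 */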
1204 		tmp_bit_chk = 0;
1205 		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS-1; ; vg--) {
1206 			/* reset the fifos to get pointers to known state */
1207 
1208 			addr = (u32)&phy_mgr_cmd->fifo_reset;
1209 			writel(0, addr);
1210 			addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RESET_READ_DATAPATH_OFFSET;
1211 			writel(0, addr);
1212 
1213 			tmp_bit_chk = tmp_bit_chk << (RW_MGR_MEM_DQ_PER_READ_DQS
1214 				/ RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS);
1215 
1216 			addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1217 			writel(RW_MGR_GUARANTEED_READ, addr +
1218 			       ((group * RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS +
1219 				vg) << 2));
1220 
1221 			addr = SDR_PHYGRP_RWMGRGRP_ADDRESS;
1222 			base_rw_mgr = readl(addr);
1223 			tmp_bit_chk = tmp_bit_chk | (correct_mask_vg & (~base_rw_mgr));
1224 
1225 			if (vg == 0)
1226 				break;
1227 		}
1228 		*bit_chk &= tmp_bit_chk;
1229 	}
1230 
1231 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1232 	writel(RW_MGR_CLEAR_DQS_ENABLE, addr + (group << 2));
1233 
1234 	set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
1235 	debug_cond(DLEVEL == 1, "%s:%d test_load_patterns(%u,ALL) => (%u == %u) =>\
1236 		   %lu\n", __func__, __LINE__, group, *bit_chk, param->read_correct_mask,
1237 		   (long unsigned int)(*bit_chk == param->read_correct_mask));
1238 	return *bit_chk == param->read_correct_mask;
1239 }
1240 
1241 static uint32_t rw_mgr_mem_calibrate_read_test_patterns_all_ranks
1242 	(uint32_t group, uint32_t num_tries, uint32_t *bit_chk)
1243 {
1244 	return rw_mgr_mem_calibrate_read_test_patterns(0, group,
1245 		num_tries, bit_chk, 1);
1246 }
1247 
1248 /* load up the patterns we are going to use during a read test */
1249 static void rw_mgr_mem_calibrate_read_load_patterns(uint32_t rank_bgn,
1250 	uint32_t all_ranks)
1251 {
1252 	uint32_t r;
1253 	uint32_t addr;
1254 	uint32_t rank_end = all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS :
1255 		(rank_bgn + NUM_RANKS_PER_SHADOW_REG);
1256 
1257 	debug("%s:%d\n", __func__, __LINE__);
1258 	for (r = rank_bgn; r < rank_end; r++) {
1259 		if (param->skip_ranks[r])
1260 			/* request to skip the rank */
1261 			continue;
1262 
1263 		/* set rank */
1264 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_READ_WRITE);
1265 
1266 		/* Load up a constant burst */
1267 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
1268 		writel(0x20, addr);
1269 
1270 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
1271 		writel(RW_MGR_GUARANTEED_WRITE_WAIT0, addr);
1272 
1273 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
1274 		writel(0x20, addr);
1275 
1276 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
1277 		writel(RW_MGR_GUARANTEED_WRITE_WAIT1, addr);
1278 
1279 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr2;
1280 		writel(0x04, addr);
1281 
1282 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
1283 		writel(RW_MGR_GUARANTEED_WRITE_WAIT2, addr);
1284 
1285 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr3;
1286 		writel(0x04, addr);
1287 
1288 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add3;
1289 		writel(RW_MGR_GUARANTEED_WRITE_WAIT3, addr);
1290 
1291 		addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1292 		writel(RW_MGR_GUARANTEED_WRITE, addr);
1293 	}
1294 
1295 	set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
1296 }
1297 
1298 /*
1299  * Try a read and see if it returns correct data back. Has dummy reads
1300  * inserted into the mix, used to align DQS enable. Has more thorough
1301  * checks than the regular read test.
1302  */
1303 static uint32_t rw_mgr_mem_calibrate_read_test(uint32_t rank_bgn, uint32_t group,
1304 	uint32_t num_tries, uint32_t all_correct, uint32_t *bit_chk,
1305 	uint32_t all_groups, uint32_t all_ranks)
1306 {
1307 	uint32_t r, vg;
1308 	uint32_t correct_mask_vg;
1309 	uint32_t tmp_bit_chk;
1310 	uint32_t rank_end = all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS :
1311 		(rank_bgn + NUM_RANKS_PER_SHADOW_REG);
1312 	uint32_t addr;
1313 	uint32_t base_rw_mgr;
1314 
1315 	*bit_chk = param->read_correct_mask;
1316 	correct_mask_vg = param->read_correct_mask_vg;
1317 
1318 	uint32_t quick_read_mode = (((STATIC_CALIB_STEPS) &
1319 		CALIB_SKIP_DELAY_SWEEPS) && ENABLE_SUPER_QUICK_CALIBRATION);
1320 
1321 	for (r = rank_bgn; r < rank_end; r++) {
1322 		if (param->skip_ranks[r])
1323 			/* request to skip the rank */
1324 			continue;
1325 
1326 		/* set rank */
1327 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_READ_WRITE);
1328 
1329 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
1330 		writel(0x10, addr);
1331 
1332 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
1333 		writel(RW_MGR_READ_B2B_WAIT1, addr);
1334 
1335 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr2;
1336 		writel(0x10, addr);
1337 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
1338 		writel(RW_MGR_READ_B2B_WAIT2, addr);
1339 
1340 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
1341 		if (quick_read_mode)
1342 			writel(0x1, addr);
1343 			/* need at least two (1+1) reads to capture failures */
1344 		else if (all_groups)
1345 			writel(0x06, addr);
1346 		else
1347 			writel(0x32, addr);
1348 
1349 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
1350 		writel(RW_MGR_READ_B2B, addr);
1351 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr3;
1352 		if (all_groups)
1353 			writel(RW_MGR_MEM_IF_READ_DQS_WIDTH *
1354 			       RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS - 1,
1355 			       addr);
1356 		else
1357 			writel(0x0, addr);
1358 
1359 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add3;
1360 		writel(RW_MGR_READ_B2B, addr);
1361 
1362 		tmp_bit_chk = 0;
1363 		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS-1; ; vg--) {
1364 			/* reset the fifos to get pointers to known state */
1365 			addr = (u32)&phy_mgr_cmd->fifo_reset;
1366 			writel(0, addr);
1367 			addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RESET_READ_DATAPATH_OFFSET;
1368 			writel(0, addr);
1369 
1370 			tmp_bit_chk = tmp_bit_chk << (RW_MGR_MEM_DQ_PER_READ_DQS
1371 				/ RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS);
1372 
1373 			if (all_groups)
1374 				addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_ALL_GROUPS_OFFSET;
1375 			else
1376 				addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1377 
1378 			writel(RW_MGR_READ_B2B, addr +
1379 			       ((group * RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS +
1380 			       vg) << 2));
1381 
1382 			addr = SDR_PHYGRP_RWMGRGRP_ADDRESS;
1383 			base_rw_mgr = readl(addr);
1384 			tmp_bit_chk = tmp_bit_chk | (correct_mask_vg & ~(base_rw_mgr));
1385 
1386 			if (vg == 0)
1387 				break;
1388 		}
1389 		*bit_chk &= tmp_bit_chk;
1390 	}
1391 
1392 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
1393 	writel(RW_MGR_CLEAR_DQS_ENABLE, addr + (group << 2));
1394 
1395 	if (all_correct) {
1396 		set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
1397 		debug_cond(DLEVEL == 2, "%s:%d read_test(%u,ALL,%u) =>\
1398 			   (%u == %u) => %lu", __func__, __LINE__, group,
1399 			   all_groups, *bit_chk, param->read_correct_mask,
1400 			   (long unsigned int)(*bit_chk ==
1401 			   param->read_correct_mask));
1402 		return *bit_chk == param->read_correct_mask;
1403 	} else	{
1404 		set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
1405 		debug_cond(DLEVEL == 2, "%s:%d read_test(%u,ONE,%u) =>\
1406 			   (%u != %lu) => %lu\n", __func__, __LINE__,
1407 			   group, all_groups, *bit_chk, (long unsigned int)0,
1408 			   (long unsigned int)(*bit_chk != 0x00));
1409 		return *bit_chk != 0x00;
1410 	}
1411 }
1412 
1413 static uint32_t rw_mgr_mem_calibrate_read_test_all_ranks(uint32_t group,
1414 	uint32_t num_tries, uint32_t all_correct, uint32_t *bit_chk,
1415 	uint32_t all_groups)
1416 {
1417 	return rw_mgr_mem_calibrate_read_test(0, group, num_tries, all_correct,
1418 					      bit_chk, all_groups, 1);
1419 }
1420 
1421 static void rw_mgr_incr_vfifo(uint32_t grp, uint32_t *v)
1422 {
1423 	uint32_t addr = (u32)&phy_mgr_cmd->inc_vfifo_hard_phy;
1424 
1425 	writel(grp, addr);
1426 	(*v)++;
1427 }
1428 
1429 static void rw_mgr_decr_vfifo(uint32_t grp, uint32_t *v)
1430 {
1431 	uint32_t i;
1432 
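	/*
	 * The VFIFO pointer wraps modulo VFIFO_SIZE, so stepping forward
	 * VFIFO_SIZE - 1 times is equivalent to stepping back once.
	 */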
1433 	for (i = 0; i < VFIFO_SIZE-1; i++)
1434 		rw_mgr_incr_vfifo(grp, v);
1435 }
1436 
1437 static int find_vfifo_read(uint32_t grp, uint32_t *bit_chk)
1438 {
1439 	uint32_t  v;
1440 	uint32_t fail_cnt = 0;
1441 	uint32_t test_status;
1442 
1443 	for (v = 0; v < VFIFO_SIZE; ) {
1444 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: vfifo %u\n",
1445 			   __func__, __LINE__, v);
1446 		test_status = rw_mgr_mem_calibrate_read_test_all_ranks
1447 			(grp, 1, PASS_ONE_BIT, bit_chk, 0);
1448 		if (!test_status) {
1449 			fail_cnt++;
1450 
1451 			if (fail_cnt == 2)
1452 				break;
1453 		}
1454 
1455 		/* fiddle with FIFO */
1456 		rw_mgr_incr_vfifo(grp, &v);
1457 	}
1458 
1459 	if (v >= VFIFO_SIZE) {
1460 		/* no failing read found!! Something must have gone wrong */
1461 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: vfifo failed\n",
1462 			   __func__, __LINE__);
1463 		return 0;
1464 	} else {
1465 		return v;
1466 	}
1467 }
1468 
1469 static int find_working_phase(uint32_t *grp, uint32_t *bit_chk,
1470 			      uint32_t dtaps_per_ptap, uint32_t *work_bgn,
1471 			      uint32_t *v, uint32_t *d, uint32_t *p,
1472 			      uint32_t *i, uint32_t *max_working_cnt)
1473 {
1474 	uint32_t found_begin = 0;
1475 	uint32_t tmp_delay = 0;
1476 	uint32_t test_status;
1477 
1478 	for (*d = 0; *d <= dtaps_per_ptap; (*d)++, tmp_delay +=
1479 		IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
1480 		*work_bgn = tmp_delay;
1481 		scc_mgr_set_dqs_en_delay_all_ranks(*grp, *d);
1482 
1483 		for (*i = 0; *i < VFIFO_SIZE; (*i)++) {
1484 			for (*p = 0; *p <= IO_DQS_EN_PHASE_MAX; (*p)++, *work_bgn +=
1485 				IO_DELAY_PER_OPA_TAP) {
1486 				scc_mgr_set_dqs_en_phase_all_ranks(*grp, *p);
1487 
1488 				test_status =
1489 				rw_mgr_mem_calibrate_read_test_all_ranks
1490 				(*grp, 1, PASS_ONE_BIT, bit_chk, 0);
1491 
1492 				if (test_status) {
1493 					*max_working_cnt = 1;
1494 					found_begin = 1;
1495 					break;
1496 				}
1497 			}
1498 
1499 			if (found_begin)
1500 				break;
1501 
1502 			if (*p > IO_DQS_EN_PHASE_MAX)
1503 				/* fiddle with FIFO */
1504 				rw_mgr_incr_vfifo(*grp, v);
1505 		}
1506 
1507 		if (found_begin)
1508 			break;
1509 	}
1510 
1511 	if (*i >= VFIFO_SIZE) {
1512 		/* cannot find working solution */
1513 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: no vfifo/\
1514 			   ptap/dtap\n", __func__, __LINE__);
1515 		return 0;
1516 	} else {
1517 		return 1;
1518 	}
1519 }
1520 
1521 static void sdr_backup_phase(uint32_t *grp, uint32_t *bit_chk,
1522 			     uint32_t *work_bgn, uint32_t *v, uint32_t *d,
1523 			     uint32_t *p, uint32_t *max_working_cnt)
1524 {
1525 	uint32_t found_begin = 0;
1526 	uint32_t tmp_delay;
1527 
1528 	/* Special case code for backing up a phase */
1529 	if (*p == 0) {
1530 		*p = IO_DQS_EN_PHASE_MAX;
1531 		rw_mgr_decr_vfifo(*grp, v);
1532 	} else {
1533 		(*p)--;
1534 	}
1535 	tmp_delay = *work_bgn - IO_DELAY_PER_OPA_TAP;
1536 	scc_mgr_set_dqs_en_phase_all_ranks(*grp, *p);
1537 
1538 	for (*d = 0; *d <= IO_DQS_EN_DELAY_MAX && tmp_delay < *work_bgn;
1539 		(*d)++, tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
1540 		scc_mgr_set_dqs_en_delay_all_ranks(*grp, *d);
1541 
1542 		if (rw_mgr_mem_calibrate_read_test_all_ranks(*grp, 1,
1543 							     PASS_ONE_BIT,
1544 							     bit_chk, 0)) {
1545 			found_begin = 1;
1546 			*work_bgn = tmp_delay;
1547 			break;
1548 		}
1549 	}
1550 
1551 	/* We have found a working dtap before the ptap found above */
1552 	if (found_begin == 1)
1553 		(*max_working_cnt)++;
1554 
1555 	/*
1556 	 * Restore VFIFO to old state before we decremented it
1557 	 * (if needed).
1558 	 */
1559 	(*p)++;
1560 	if (*p > IO_DQS_EN_PHASE_MAX) {
1561 		*p = 0;
1562 		rw_mgr_incr_vfifo(*grp, v);
1563 	}
1564 
1565 	scc_mgr_set_dqs_en_delay_all_ranks(*grp, 0);
1566 }
1567 
1568 static int sdr_nonworking_phase(uint32_t *grp, uint32_t *bit_chk,
1569 			     uint32_t *work_bgn, uint32_t *v, uint32_t *d,
1570 			     uint32_t *p, uint32_t *i, uint32_t *max_working_cnt,
1571 			     uint32_t *work_end)
1572 {
1573 	uint32_t found_end = 0;
1574 
1575 	(*p)++;
1576 	*work_end += IO_DELAY_PER_OPA_TAP;
1577 	if (*p > IO_DQS_EN_PHASE_MAX) {
1578 		/* fiddle with FIFO */
1579 		*p = 0;
1580 		rw_mgr_incr_vfifo(*grp, v);
1581 	}
1582 
1583 	for (; *i < VFIFO_SIZE + 1; (*i)++) {
1584 		for (; *p <= IO_DQS_EN_PHASE_MAX; (*p)++, *work_end
1585 			+= IO_DELAY_PER_OPA_TAP) {
1586 			scc_mgr_set_dqs_en_phase_all_ranks(*grp, *p);
1587 
1588 			if (!rw_mgr_mem_calibrate_read_test_all_ranks
1589 				(*grp, 1, PASS_ONE_BIT, bit_chk, 0)) {
1590 				found_end = 1;
1591 				break;
1592 			} else {
1593 				(*max_working_cnt)++;
1594 			}
1595 		}
1596 
1597 		if (found_end)
1598 			break;
1599 
1600 		if (*p > IO_DQS_EN_PHASE_MAX) {
1601 			/* fiddle with FIFO */
1602 			rw_mgr_incr_vfifo(*grp, v);
1603 			*p = 0;
1604 		}
1605 	}
1606 
1607 	if (*i >= VFIFO_SIZE + 1) {
1608 		/* cannot see edge of failing read */
1609 		debug_cond(DLEVEL == 2, "%s:%d sdr_nonworking_phase: end:\
1610 			   failed\n", __func__, __LINE__);
1611 		return 0;
1612 	} else {
1613 		return 1;
1614 	}
1615 }
1616 
1617 static int sdr_find_window_centre(uint32_t *grp, uint32_t *bit_chk,
1618 				  uint32_t *work_bgn, uint32_t *v, uint32_t *d,
1619 				  uint32_t *p, uint32_t *work_mid,
1620 				  uint32_t *work_end)
1621 {
1622 	int i;
1623 	int tmp_delay = 0;
1624 
1625 	*work_mid = (*work_bgn + *work_end) / 2;
1626 
1627 	debug_cond(DLEVEL == 2, "work_bgn=%d work_end=%d work_mid=%d\n",
1628 		   *work_bgn, *work_end, *work_mid);
1629 	/* Get the middle delay to be less than a VFIFO delay */
1630 	for (*p = 0; *p <= IO_DQS_EN_PHASE_MAX;
1631 		(*p)++, tmp_delay += IO_DELAY_PER_OPA_TAP)
1632 		;
1633 	debug_cond(DLEVEL == 2, "vfifo ptap delay %d\n", tmp_delay);
1634 	while (*work_mid > tmp_delay)
1635 		*work_mid -= tmp_delay;
1636 	debug_cond(DLEVEL == 2, "new work_mid %d\n", *work_mid);
1637 
1638 	tmp_delay = 0;
1639 	for (*p = 0; *p <= IO_DQS_EN_PHASE_MAX && tmp_delay < *work_mid;
1640 		(*p)++, tmp_delay += IO_DELAY_PER_OPA_TAP)
1641 		;
1642 	tmp_delay -= IO_DELAY_PER_OPA_TAP;
1643 	debug_cond(DLEVEL == 2, "new p %d, tmp_delay=%d\n", (*p) - 1, tmp_delay);
1644 	for (*d = 0; *d <= IO_DQS_EN_DELAY_MAX && tmp_delay < *work_mid; (*d)++,
1645 		tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP)
1646 		;
1647 	debug_cond(DLEVEL == 2, "new d %d, tmp_delay=%d\n", *d, tmp_delay);
1648 
1649 	scc_mgr_set_dqs_en_phase_all_ranks(*grp, (*p) - 1);
1650 	scc_mgr_set_dqs_en_delay_all_ranks(*grp, *d);
1651 
1652 	/*
1653 	 * Push the vfifo until we can successfully calibrate. We can do this
1654 	 * because the largest possible margin is 1 VFIFO cycle.
1655 	 */
1656 	for (i = 0; i < VFIFO_SIZE; i++) {
1657 		debug_cond(DLEVEL == 2, "find_dqs_en_phase: center: vfifo=%u\n",
1658 			   *v);
1659 		if (rw_mgr_mem_calibrate_read_test_all_ranks(*grp, 1,
1660 							     PASS_ONE_BIT,
1661 							     bit_chk, 0)) {
1662 			break;
1663 		}
1664 
1665 		/* fiddle with FIFO */
1666 		rw_mgr_incr_vfifo(*grp, v);
1667 	}
1668 
1669 	if (i >= VFIFO_SIZE) {
1670 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: center: \
1671 			   failed\n", __func__, __LINE__);
1672 		return 0;
1673 	} else {
1674 		return 1;
1675 	}
1676 }
1677 
1678 /* find a good dqs enable to use */
1679 static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase(uint32_t grp)
1680 {
1681 	uint32_t v, d, p, i;
1682 	uint32_t max_working_cnt;
1683 	uint32_t bit_chk;
1684 	uint32_t dtaps_per_ptap;
1685 	uint32_t work_bgn, work_mid, work_end;
1686 	uint32_t found_passing_read, found_failing_read, initial_failing_dtap;
1687 	uint32_t addr;
1688 
1689 	debug("%s:%d %u\n", __func__, __LINE__, grp);
1690 
1691 	reg_file_set_sub_stage(CAL_SUBSTAGE_VFIFO_CENTER);
1692 
1693 	scc_mgr_set_dqs_en_delay_all_ranks(grp, 0);
1694 	scc_mgr_set_dqs_en_phase_all_ranks(grp, 0);
1695 
1696 	/* ************************************************************** */
1697 	/* * Step 0 : Determine number of delay taps for each phase tap * */
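	/*
	 * For example, if IO_DELAY_PER_OPA_TAP were 300 ps and
	 * IO_DELAY_PER_DQS_EN_DCHAIN_TAP were 25 ps (both device-specific
	 * values), the division below would give dtaps_per_ptap = 12.
	 */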
1698 	dtaps_per_ptap = IO_DELAY_PER_OPA_TAP/IO_DELAY_PER_DQS_EN_DCHAIN_TAP;
1699 
1700 	/* ********************************************************* */
1701 	/* * Step 1 : First push vfifo until we get a failing read * */
1702 	v = find_vfifo_read(grp, &bit_chk);
1703 
1704 	max_working_cnt = 0;
1705 
1706 	/* ******************************************************** */
1707 	/* * step 2: find first working phase, increment in ptaps * */
1708 	work_bgn = 0;
1709 	if (find_working_phase(&grp, &bit_chk, dtaps_per_ptap, &work_bgn, &v, &d,
1710 				&p, &i, &max_working_cnt) == 0)
1711 		return 0;
1712 
1713 	work_end = work_bgn;
1714 
1715 	/*
1716 	 * If d is 0 then the working window covers a phase tap and
1717 	 * we can follow the old procedure; otherwise, we've found the
1718 	 * beginning, and we need to increment the dtaps until we find the end.
1719 	 */
1720 	if (d == 0) {
1721 		/* ********************************************************* */
1722 		/* * step 3a: if we have room, back off by one and
1723 		increment in dtaps * */
1724 
1725 		sdr_backup_phase(&grp, &bit_chk, &work_bgn, &v, &d, &p,
1726 				 &max_working_cnt);
1727 
1728 		/* ********************************************************* */
1729 		/* * step 4a: go forward from working phase to non working
1730 		phase, increment in ptaps * */
1731 		if (sdr_nonworking_phase(&grp, &bit_chk, &work_bgn, &v, &d, &p,
1732 					 &i, &max_working_cnt, &work_end) == 0)
1733 			return 0;
1734 
1735 		/* ********************************************************* */
1736 		/* * step 5a:  back off one from last, increment in dtaps  * */
1737 
1738 		/* Special case code for backing up a phase */
1739 		if (p == 0) {
1740 			p = IO_DQS_EN_PHASE_MAX;
1741 			rw_mgr_decr_vfifo(grp, &v);
1742 		} else {
1743 			p = p - 1;
1744 		}
1745 
1746 		work_end -= IO_DELAY_PER_OPA_TAP;
1747 		scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
1748 
1749 		/* * The actual increment of dtaps is done outside of
1750 		the if/else block to share code */
1751 		d = 0;
1752 
1753 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: v/p: \
1754 			   vfifo=%u ptap=%u\n", __func__, __LINE__,
1755 			   v, p);
1756 	} else {
1757 		/* ******************************************************* */
1758 		/* * step 3-5b:  Find the right edge of the window using
1759 		delay taps   * */
1760 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase:vfifo=%u \
1761 			   ptap=%u dtap=%u bgn=%u\n", __func__, __LINE__,
1762 			   v, p, d, work_bgn);
1763 
1764 		work_end = work_bgn;
1765 
1766 		/* * The actual increment of dtaps is done outside of the
1767 		if/else block to share code */
1768 
1769 		/* Only here to counterbalance a subtract later on which is
1770 		not needed if this branch of the algorithm is taken */
1771 		max_working_cnt++;
1772 	}
1773 
1774 	/* The dtap increment to find the failing edge is done here */
1775 	for (; d <= IO_DQS_EN_DELAY_MAX; d++, work_end +=
1776 		IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
1777 			debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: \
1778 				   end-2: dtap=%u\n", __func__, __LINE__, d);
1779 			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
1780 
1781 			if (!rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1,
1782 								      PASS_ONE_BIT,
1783 								      &bit_chk, 0)) {
1784 				break;
1785 			}
1786 	}
1787 
1788 	/* Go back to working dtap */
1789 	if (d != 0)
1790 		work_end -= IO_DELAY_PER_DQS_EN_DCHAIN_TAP;
1791 
1792 	debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: v/p/d: vfifo=%u \
1793 		   ptap=%u dtap=%u end=%u\n", __func__, __LINE__,
1794 		   v, p, d-1, work_end);
1795 
1796 	if (work_end < work_bgn) {
1797 		/* nil range */
1798 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: end-2: \
1799 			   failed\n", __func__, __LINE__);
1800 		return 0;
1801 	}
1802 
1803 	debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: found range [%u,%u]\n",
1804 		   __func__, __LINE__, work_bgn, work_end);
1805 
1806 	/* *************************************************************** */
1807 	/*
1808 	 * * We need to calculate the number of dtaps that equal a ptap
1809 	 * * To do that we'll back up a ptap and re-find the edge of the
1810 	 * * window using dtaps
1811 	 */
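	/*
	 * Worked example with hypothetical tap sizes (purely illustrative):
	 * if one phase tap (IO_DELAY_PER_OPA_TAP) were 400 ps and one
	 * delay-chain tap (IO_DELAY_PER_DQS_EN_DCHAIN_TAP) were 25 ps,
	 * backing up one ptap and re-finding the edge with dtaps should take
	 * roughly 400 / 25 = 16 dtaps, which becomes the dtaps_per_ptap
	 * value recorded for tracking below.
	 */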
1812 
1813 	debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: calculate dtaps_per_ptap \
1814 		   for tracking\n", __func__, __LINE__);
1815 
1816 	/* Special case code for backing up a phase */
1817 	if (p == 0) {
1818 		p = IO_DQS_EN_PHASE_MAX;
1819 		rw_mgr_decr_vfifo(grp, &v);
1820 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: backedup \
1821 			   cycle/phase: v=%u p=%u\n", __func__, __LINE__,
1822 			   v, p);
1823 	} else {
1824 		p = p - 1;
1825 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: backedup \
1826 			   phase only: v=%u p=%u", __func__, __LINE__,
1827 			   v, p);
1828 	}
1829 
1830 	scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
1831 
1832 	/*
1833 	 * Increase dtap until we first see a passing read (in case the
1834 	 * window is smaller than a ptap),
1835 	 * and then a failing read to mark the edge of the window again
1836 	 */
1837 
1838 	/* Find a passing read */
1839 	debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: find passing read\n",
1840 		   __func__, __LINE__);
1841 	found_passing_read = 0;
1842 	found_failing_read = 0;
1843 	initial_failing_dtap = d;
1844 	for (; d <= IO_DQS_EN_DELAY_MAX; d++) {
1845 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: testing \
1846 			   read d=%u\n", __func__, __LINE__, d);
1847 		scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
1848 
1849 		if (rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1,
1850 							     PASS_ONE_BIT,
1851 							     &bit_chk, 0)) {
1852 			found_passing_read = 1;
1853 			break;
1854 		}
1855 	}
1856 
1857 	if (found_passing_read) {
1858 		/* Find a failing read */
1859 		debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: find failing \
1860 			   read\n", __func__, __LINE__);
1861 		for (d = d + 1; d <= IO_DQS_EN_DELAY_MAX; d++) {
1862 			debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: \
1863 				   testing read d=%u\n", __func__, __LINE__, d);
1864 			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
1865 
1866 			if (!rw_mgr_mem_calibrate_read_test_all_ranks
1867 				(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
1868 				found_failing_read = 1;
1869 				break;
1870 			}
1871 		}
1872 	} else {
1873 		debug_cond(DLEVEL == 1, "%s:%d find_dqs_en_phase: failed to \
1874 			   calculate dtaps", __func__, __LINE__);
1875 		debug_cond(DLEVEL == 1, "per ptap. Fall back on static value\n");
1876 	}
1877 
1878 	/*
1879 	 * The dynamically calculated dtaps_per_ptap is only valid if we
1880 	 * found both a passing and a failing read. If we didn't, it means d
1881 	 * hit the max (IO_DQS_EN_DELAY_MAX) and dtaps_per_ptap retains its
1882 	 * statically calculated value.
1883 	 */
1884 	if (found_passing_read && found_failing_read)
1885 		dtaps_per_ptap = d - initial_failing_dtap;
1886 
1887 	addr = (u32)&sdr_reg_file->dtaps_per_ptap;
1888 	writel(dtaps_per_ptap, addr);
1889 	debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: dtaps_per_ptap=%u \
1890 		   - %u = %u",  __func__, __LINE__, d,
1891 		   initial_failing_dtap, dtaps_per_ptap);
1892 
1893 	/* ******************************************** */
1894 	/* * step 6:  Find the centre of the window   * */
1895 	if (sdr_find_window_centre(&grp, &bit_chk, &work_bgn, &v, &d, &p,
1896 				   &work_mid, &work_end) == 0)
1897 		return 0;
1898 
1899 	debug_cond(DLEVEL == 2, "%s:%d find_dqs_en_phase: center found: \
1900 		   vfifo=%u ptap=%u dtap=%u\n", __func__, __LINE__,
1901 		   v, p-1, d);
1902 	return 1;
1903 }
1904 
1905 /*
1906  * Try rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase across different
1907  * dq_in_delay values
1908  */
1909 static uint32_t
1910 rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay
1911 (uint32_t write_group, uint32_t read_group, uint32_t test_bgn)
1912 {
1913 	uint32_t found;
1914 	uint32_t i;
1915 	uint32_t p;
1916 	uint32_t d;
1917 	uint32_t r;
1918 	uint32_t addr;
1919 
1920 	const uint32_t delay_step = IO_IO_IN_DELAY_MAX /
1921 		(RW_MGR_MEM_DQ_PER_READ_DQS-1);
1922 		/* we start at zero, so have one less dq to divide among */
1923 
1924 	debug("%s:%d (%u,%u,%u)", __func__, __LINE__, write_group, read_group,
1925 	      test_bgn);
1926 
1927 	/* try different dq_in_delays since the dq path is shorter than dqs */
1928 
1929 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
1930 	     r += NUM_RANKS_PER_SHADOW_REG) {
1931 		for (i = 0, p = test_bgn, d = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS;
1932 			i++, p++, d += delay_step) {
1933 			debug_cond(DLEVEL == 1, "%s:%d rw_mgr_mem_calibrate_\
1934 				   vfifo_find_dqs_", __func__, __LINE__);
1935 			debug_cond(DLEVEL == 1, "en_phase_sweep_dq_in_delay: g=%u/%u ",
1936 			       write_group, read_group);
1937 			debug_cond(DLEVEL == 1, "r=%u, i=%u p=%u d=%u\n", r, i , p, d);
1938 			scc_mgr_set_dq_in_delay(write_group, p, d);
1939 			scc_mgr_load_dq(p);
1940 		}
1941 		addr = (u32)&sdr_scc_mgr->update;
1942 		writel(0, addr);
1943 	}
1944 
1945 	found = rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase(read_group);
1946 
1947 	debug_cond(DLEVEL == 1, "%s:%d rw_mgr_mem_calibrate_vfifo_find_dqs_\
1948 		   en_phase_sweep_dq", __func__, __LINE__);
1949 	debug_cond(DLEVEL == 1, "_in_delay: g=%u/%u found=%u; Resetting delay \
1950 		   chain to zero\n", write_group, read_group, found);
1951 
1952 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
1953 	     r += NUM_RANKS_PER_SHADOW_REG) {
1954 		for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS;
1955 			i++, p++) {
1956 			scc_mgr_set_dq_in_delay(write_group, p, 0);
1957 			scc_mgr_load_dq(p);
1958 		}
1959 		addr = (u32)&sdr_scc_mgr->update;
1960 		writel(0, addr);
1961 	}
1962 
1963 	return found;
1964 }
1965 
1966 /* per-bit deskew DQ and center */
1967 static uint32_t rw_mgr_mem_calibrate_vfifo_center(uint32_t rank_bgn,
1968 	uint32_t write_group, uint32_t read_group, uint32_t test_bgn,
1969 	uint32_t use_read_test, uint32_t update_fom)
1970 {
1971 	uint32_t i, p, d, min_index;
1972 	/*
1973 	 * Store these as signed since there are comparisons with
1974 	 * signed numbers.
1975 	 */
1976 	uint32_t bit_chk;
1977 	uint32_t sticky_bit_chk;
1978 	int32_t left_edge[RW_MGR_MEM_DQ_PER_READ_DQS];
1979 	int32_t right_edge[RW_MGR_MEM_DQ_PER_READ_DQS];
1980 	int32_t final_dq[RW_MGR_MEM_DQ_PER_READ_DQS];
1981 	int32_t mid;
1982 	int32_t orig_mid_min, mid_min;
1983 	int32_t new_dqs, start_dqs, start_dqs_en, shift_dq, final_dqs,
1984 		final_dqs_en;
1985 	int32_t dq_margin, dqs_margin;
1986 	uint32_t stop;
1987 	uint32_t temp_dq_in_delay1;
1988 	uint32_t addr;
1989 
1990 	debug("%s:%d: %u %u", __func__, __LINE__, read_group, test_bgn);
1991 
1992 	addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_DQS_IN_DELAY_OFFSET;
1993 	start_dqs = readl(addr + (read_group << 2));
1994 	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS)
1995 		start_dqs_en = readl(addr + ((read_group << 2)
1996 				     - IO_DQS_EN_DELAY_OFFSET));
1997 
1998 	/* set the left and right edge of each bit to an illegal value */
1999 	/* use (IO_IO_IN_DELAY_MAX + 1) as an illegal value */
2000 	sticky_bit_chk = 0;
2001 	for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
2002 		left_edge[i]  = IO_IO_IN_DELAY_MAX + 1;
2003 		right_edge[i] = IO_IO_IN_DELAY_MAX + 1;
2004 	}
2005 
2006 	addr = (u32)&sdr_scc_mgr->update;
2007 	/* Search for the left edge of the window for each bit */
2008 	for (d = 0; d <= IO_IO_IN_DELAY_MAX; d++) {
2009 		scc_mgr_apply_group_dq_in_delay(write_group, test_bgn, d);
2010 
2011 		writel(0, addr);
2012 
2013 		/*
2014 		 * Stop searching when the read test doesn't pass AND when
2015 		 * we've seen a passing read on every bit.
2016 		 */
2017 		if (use_read_test) {
2018 			stop = !rw_mgr_mem_calibrate_read_test(rank_bgn,
2019 				read_group, NUM_READ_PB_TESTS, PASS_ONE_BIT,
2020 				&bit_chk, 0, 0);
2021 		} else {
2022 			rw_mgr_mem_calibrate_write_test(rank_bgn, write_group,
2023 							0, PASS_ONE_BIT,
2024 							&bit_chk, 0);
2025 			bit_chk = bit_chk >> (RW_MGR_MEM_DQ_PER_READ_DQS *
2026 				(read_group - (write_group *
2027 					RW_MGR_MEM_IF_READ_DQS_WIDTH /
2028 					RW_MGR_MEM_IF_WRITE_DQS_WIDTH)));
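			/*
			 * Illustrative shift (hypothetical widths): with
			 * RW_MGR_MEM_IF_READ_DQS_WIDTH twice
			 * RW_MGR_MEM_IF_WRITE_DQS_WIDTH and 8 DQ per read
			 * DQS, read_group 3 within write_group 1 shifts the
			 * write-test result by 8 * (3 - 2) = 8 bits so this
			 * read group's bits land at bit 0 of bit_chk.
			 */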
2029 			stop = (bit_chk == 0);
2030 		}
2031 		sticky_bit_chk = sticky_bit_chk | bit_chk;
2032 		stop = stop && (sticky_bit_chk == param->read_correct_mask);
2033 		debug_cond(DLEVEL == 2, "%s:%d vfifo_center(left): dtap=%u => %u == %u \
2034 			   && %u", __func__, __LINE__, d,
2035 			   sticky_bit_chk,
2036 			param->read_correct_mask, stop);
2037 
2038 		if (stop == 1) {
2039 			break;
2040 		} else {
2041 			for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
2042 				if (bit_chk & 1) {
2043 					/* Remember a passing test as the
2044 					left_edge */
2045 					left_edge[i] = d;
2046 				} else {
2047 					/* If a left edge has not been seen yet,
2048 					then a future passing test will mark
2049 					this edge as the right edge */
2050 					if (left_edge[i] ==
2051 						IO_IO_IN_DELAY_MAX + 1) {
2052 						right_edge[i] = -(d + 1);
2053 					}
2054 				}
2055 				bit_chk = bit_chk >> 1;
2056 			}
2057 		}
2058 	}
2059 
2060 	/* Reset DQ delay chains to 0 */
2061 	scc_mgr_apply_group_dq_in_delay(write_group, test_bgn, 0);
2062 	sticky_bit_chk = 0;
2063 	for (i = RW_MGR_MEM_DQ_PER_READ_DQS - 1;; i--) {
2064 		debug_cond(DLEVEL == 2, "%s:%d vfifo_center: left_edge[%u]: \
2065 			   %d right_edge[%u]: %d\n", __func__, __LINE__,
2066 			   i, left_edge[i], i, right_edge[i]);
2067 
2068 		/*
2069 		 * Check for cases where we haven't found the left edge,
2070 		 * which makes our assignment of the right edge invalid.
2071 		 * Reset it to the illegal value.
2072 		 */
2073 		if ((left_edge[i] == IO_IO_IN_DELAY_MAX + 1) && (
2074 			right_edge[i] != IO_IO_IN_DELAY_MAX + 1)) {
2075 			right_edge[i] = IO_IO_IN_DELAY_MAX + 1;
2076 			debug_cond(DLEVEL == 2, "%s:%d vfifo_center: reset \
2077 				   right_edge[%u]: %d\n", __func__, __LINE__,
2078 				   i, right_edge[i]);
2079 		}
2080 
2081 		/*
2082 		 * Reset sticky bit (except for bits where we have seen
2083 		 * both the left and right edge).
2084 		 */
2085 		sticky_bit_chk = sticky_bit_chk << 1;
2086 		if ((left_edge[i] != IO_IO_IN_DELAY_MAX + 1) &&
2087 		    (right_edge[i] != IO_IO_IN_DELAY_MAX + 1)) {
2088 			sticky_bit_chk = sticky_bit_chk | 1;
2089 		}
2090 
2091 		if (i == 0)
2092 			break;
2093 	}
2094 
2095 	addr = (u32)&sdr_scc_mgr->update;
2096 	/* Search for the right edge of the window for each bit */
2097 	for (d = 0; d <= IO_DQS_IN_DELAY_MAX - start_dqs; d++) {
2098 		scc_mgr_set_dqs_bus_in_delay(read_group, d + start_dqs);
2099 		if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
2100 			uint32_t delay = d + start_dqs_en;
2101 			if (delay > IO_DQS_EN_DELAY_MAX)
2102 				delay = IO_DQS_EN_DELAY_MAX;
2103 			scc_mgr_set_dqs_en_delay(read_group, delay);
2104 		}
2105 		scc_mgr_load_dqs(read_group);
2106 
2107 		writel(0, addr);
2108 
2109 		/*
2110 		 * Stop searching when the read test doesn't pass AND when
2111 		 * we've seen a passing read on every bit.
2112 		 */
2113 		if (use_read_test) {
2114 			stop = !rw_mgr_mem_calibrate_read_test(rank_bgn,
2115 				read_group, NUM_READ_PB_TESTS, PASS_ONE_BIT,
2116 				&bit_chk, 0, 0);
2117 		} else {
2118 			rw_mgr_mem_calibrate_write_test(rank_bgn, write_group,
2119 							0, PASS_ONE_BIT,
2120 							&bit_chk, 0);
2121 			bit_chk = bit_chk >> (RW_MGR_MEM_DQ_PER_READ_DQS *
2122 				(read_group - (write_group *
2123 					RW_MGR_MEM_IF_READ_DQS_WIDTH /
2124 					RW_MGR_MEM_IF_WRITE_DQS_WIDTH)));
2125 			stop = (bit_chk == 0);
2126 		}
2127 		sticky_bit_chk = sticky_bit_chk | bit_chk;
2128 		stop = stop && (sticky_bit_chk == param->read_correct_mask);
2129 
2130 		debug_cond(DLEVEL == 2, "%s:%d vfifo_center(right): dtap=%u => %u == \
2131 			   %u && %u", __func__, __LINE__, d,
2132 			   sticky_bit_chk, param->read_correct_mask, stop);
2133 
2134 		if (stop == 1) {
2135 			break;
2136 		} else {
2137 			for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
2138 				if (bit_chk & 1) {
2139 					/* Remember a passing test as
2140 					the right_edge */
2141 					right_edge[i] = d;
2142 				} else {
2143 					if (d != 0) {
2144 						/* If a right edge has not been
2145 						seen yet, then a future passing
2146 						test will mark this edge as the
2147 						left edge */
2148 						if (right_edge[i] ==
2149 						IO_IO_IN_DELAY_MAX + 1) {
2150 							left_edge[i] = -(d + 1);
2151 						}
2152 					} else {
2153 						/* d = 0 failed, but it passed
2154 						when testing the left edge,
2155 						so it must be marginal,
2156 						set it to -1 */
2157 						if (right_edge[i] ==
2158 							IO_IO_IN_DELAY_MAX + 1 &&
2159 							left_edge[i] !=
2160 							IO_IO_IN_DELAY_MAX
2161 							+ 1) {
2162 							right_edge[i] = -1;
2163 						}
2164 						/* If a right edge has not been
2165 						seen yet, then a future passing
2166 						test will mark this edge as the
2167 						left edge */
2168 						else if (right_edge[i] ==
2169 							IO_IO_IN_DELAY_MAX +
2170 							1) {
2171 							left_edge[i] = -(d + 1);
2172 						}
2173 					}
2174 				}
2175 
2176 				debug_cond(DLEVEL == 2, "%s:%d vfifo_center[r,\
2177 					   d=%u]: ", __func__, __LINE__, d);
2178 				debug_cond(DLEVEL == 2, "bit_chk_test=%d left_edge[%u]: %d ",
2179 					   (int)(bit_chk & 1), i, left_edge[i]);
2180 				debug_cond(DLEVEL == 2, "right_edge[%u]: %d\n", i,
2181 					   right_edge[i]);
2182 				bit_chk = bit_chk >> 1;
2183 			}
2184 		}
2185 	}
2186 
2187 	/* Check that all bits have a window */
2188 	addr = (u32)&sdr_scc_mgr->update;
2189 	for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
2190 		debug_cond(DLEVEL == 2, "%s:%d vfifo_center: left_edge[%u]: \
2191 			   %d right_edge[%u]: %d", __func__, __LINE__,
2192 			   i, left_edge[i], i, right_edge[i]);
2193 		if ((left_edge[i] == IO_IO_IN_DELAY_MAX + 1) || (right_edge[i]
2194 			== IO_IO_IN_DELAY_MAX + 1)) {
2195 			/*
2196 			 * Restore delay chain settings before letting the loop
2197 			 * in rw_mgr_mem_calibrate_vfifo retry different
2198 			 * dqs/ck relationships.
2199 			 */
2200 			scc_mgr_set_dqs_bus_in_delay(read_group, start_dqs);
2201 			if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
2202 				scc_mgr_set_dqs_en_delay(read_group,
2203 							 start_dqs_en);
2204 			}
2205 			scc_mgr_load_dqs(read_group);
2206 			writel(0, addr);
2207 
2208 			debug_cond(DLEVEL == 1, "%s:%d vfifo_center: failed to \
2209 				   find edge [%u]: %d %d", __func__, __LINE__,
2210 				   i, left_edge[i], right_edge[i]);
2211 			if (use_read_test) {
2212 				set_failing_group_stage(read_group *
2213 					RW_MGR_MEM_DQ_PER_READ_DQS + i,
2214 					CAL_STAGE_VFIFO,
2215 					CAL_SUBSTAGE_VFIFO_CENTER);
2216 			} else {
2217 				set_failing_group_stage(read_group *
2218 					RW_MGR_MEM_DQ_PER_READ_DQS + i,
2219 					CAL_STAGE_VFIFO_AFTER_WRITES,
2220 					CAL_SUBSTAGE_VFIFO_CENTER);
2221 			}
2222 			return 0;
2223 		}
2224 	}
2225 
2226 	/* Find middle of window for each DQ bit */
2227 	mid_min = left_edge[0] - right_edge[0];
2228 	min_index = 0;
2229 	for (i = 1; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
2230 		mid = left_edge[i] - right_edge[i];
2231 		if (mid < mid_min) {
2232 			mid_min = mid;
2233 			min_index = i;
2234 		}
2235 	}
2236 
2237 	/*
2238 	 * -mid_min/2 represents the amount that we need to move DQS.
2239 	 * If mid_min is odd and positive we'll need to add one to
2240 	 * make sure the rounding in further calculations is correct
2241 	 * (always bias to the right), so just add 1 for all positive values.
2242 	 */
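	/*
	 * Illustrative rounding (made-up edge values): if the narrowest
	 * window has left_edge = 6 and right_edge = 3, mid_min = 3 is bumped
	 * to 4 so the divide below yields 2 and DQS moves by -2 rather than
	 * -1; a negative mid_min such as -3 is left alone and truncates to
	 * -1, moving DQS by +1.
	 */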
2243 	if (mid_min > 0)
2244 		mid_min++;
2245 
2246 	mid_min = mid_min / 2;
2247 
2248 	debug_cond(DLEVEL == 1, "%s:%d vfifo_center: mid_min=%d (index=%u)\n",
2249 		   __func__, __LINE__, mid_min, min_index);
2250 
2251 	/* Determine the amount we can change DQS (which is -mid_min) */
2252 	orig_mid_min = mid_min;
2253 	new_dqs = start_dqs - mid_min;
2254 	if (new_dqs > IO_DQS_IN_DELAY_MAX)
2255 		new_dqs = IO_DQS_IN_DELAY_MAX;
2256 	else if (new_dqs < 0)
2257 		new_dqs = 0;
2258 
2259 	mid_min = start_dqs - new_dqs;
2260 	debug_cond(DLEVEL == 1, "vfifo_center: new mid_min=%d new_dqs=%d\n",
2261 		   mid_min, new_dqs);
2262 
2263 	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
2264 		if (start_dqs_en - mid_min > IO_DQS_EN_DELAY_MAX)
2265 			mid_min += start_dqs_en - mid_min - IO_DQS_EN_DELAY_MAX;
2266 		else if (start_dqs_en - mid_min < 0)
2267 			mid_min += start_dqs_en - mid_min;
2268 	}
2269 	new_dqs = start_dqs - mid_min;
2270 
2271 	debug_cond(DLEVEL == 1, "vfifo_center: start_dqs=%d start_dqs_en=%d \
2272 		   new_dqs=%d mid_min=%d\n", start_dqs,
2273 		   IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS ? start_dqs_en : -1,
2274 		   new_dqs, mid_min);
2275 
2276 	/* Initialize data for export structures */
2277 	dqs_margin = IO_IO_IN_DELAY_MAX + 1;
2278 	dq_margin  = IO_IO_IN_DELAY_MAX + 1;
2279 
2280 	addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_IO_IN_DELAY_OFFSET;
2281 	/* add delay to bring centre of all DQ windows to the same "level" */
2282 	for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++, p++) {
2283 		/* Use values before divide by 2 to reduce round off error */
2284 		shift_dq = (left_edge[i] - right_edge[i] -
2285 			(left_edge[min_index] - right_edge[min_index]))/2  +
2286 			(orig_mid_min - mid_min);
2287 
2288 		debug_cond(DLEVEL == 2, "vfifo_center: before: \
2289 			   shift_dq[%u]=%d\n", i, shift_dq);
2290 
2291 		temp_dq_in_delay1 = readl(addr + (p << 2));
2293 
2294 		if (shift_dq + (int32_t)temp_dq_in_delay1 >
2295 			(int32_t)IO_IO_IN_DELAY_MAX) {
2296 			shift_dq = (int32_t)IO_IO_IN_DELAY_MAX - temp_dq_in_delay1;
2297 		} else if (shift_dq + (int32_t)temp_dq_in_delay1 < 0) {
2298 			shift_dq = -(int32_t)temp_dq_in_delay1;
2299 		}
2300 		debug_cond(DLEVEL == 2, "vfifo_center: after: \
2301 			   shift_dq[%u]=%d\n", i, shift_dq);
2302 		final_dq[i] = temp_dq_in_delay1 + shift_dq;
2303 		scc_mgr_set_dq_in_delay(write_group, p, final_dq[i]);
2304 		scc_mgr_load_dq(p);
2305 
2306 		debug_cond(DLEVEL == 2, "vfifo_center: margin[%u]=[%d,%d]\n", i,
2307 			   left_edge[i] - shift_dq + (-mid_min),
2308 			   right_edge[i] + shift_dq - (-mid_min));
2309 		/* To determine values for export structures */
2310 		if (left_edge[i] - shift_dq + (-mid_min) < dq_margin)
2311 			dq_margin = left_edge[i] - shift_dq + (-mid_min);
2312 
2313 		if (right_edge[i] + shift_dq - (-mid_min) < dqs_margin)
2314 			dqs_margin = right_edge[i] + shift_dq - (-mid_min);
2315 	}
2316 
2317 	final_dqs = new_dqs;
2318 	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS)
2319 		final_dqs_en = start_dqs_en - mid_min;
2320 
2321 	/* Move DQS-en */
2322 	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
2323 		scc_mgr_set_dqs_en_delay(read_group, final_dqs_en);
2324 		scc_mgr_load_dqs(read_group);
2325 	}
2326 
2327 	/* Move DQS */
2328 	scc_mgr_set_dqs_bus_in_delay(read_group, final_dqs);
2329 	scc_mgr_load_dqs(read_group);
2330 	debug_cond(DLEVEL == 2, "%s:%d vfifo_center: dq_margin=%d \
2331 		   dqs_margin=%d", __func__, __LINE__,
2332 		   dq_margin, dqs_margin);
2333 
2334 	/*
2335 	 * Do not remove this line as it makes sure all of our decisions
2336 	 * have been applied. Apply the update bit.
2337 	 */
2338 	addr = (u32)&sdr_scc_mgr->update;
2339 	writel(0, addr);
2340 
2341 	return (dq_margin >= 0) && (dqs_margin >= 0);
2342 }
2343 
2344 /*
2345  * calibrate the read valid prediction FIFO.
2346  *
2347  *  - read valid prediction will consist of finding a good DQS enable phase,
2348  * DQS enable delay, DQS input phase, and DQS input delay.
2349  *  - we also do a per-bit deskew on the DQ lines.
2350  */
2351 static uint32_t rw_mgr_mem_calibrate_vfifo(uint32_t read_group,
2352 					   uint32_t test_bgn)
2353 {
2354 	uint32_t p, d, rank_bgn, sr;
2355 	uint32_t dtaps_per_ptap;
2356 	uint32_t tmp_delay;
2357 	uint32_t bit_chk;
2358 	uint32_t grp_calibrated;
2359 	uint32_t write_group, write_test_bgn;
2360 	uint32_t failed_substage;
2361 
2362 	debug("%s:%d: %u %u\n", __func__, __LINE__, read_group, test_bgn);
2363 
2364 	/* update info for sims */
2365 	reg_file_set_stage(CAL_STAGE_VFIFO);
2366 
2367 	write_group = read_group;
2368 	write_test_bgn = test_bgn;
2369 
2370 	/* USER Determine number of delay taps for each phase tap */
2371 	dtaps_per_ptap = 0;
2372 	tmp_delay = 0;
2373 	while (tmp_delay < IO_DELAY_PER_OPA_TAP) {
2374 		dtaps_per_ptap++;
2375 		tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP;
2376 	}
2377 	dtaps_per_ptap--;
2378 	tmp_delay = 0;
2379 
2380 	/* update info for sims */
2381 	reg_file_set_group(read_group);
2382 
2383 	grp_calibrated = 0;
2384 
2385 	reg_file_set_sub_stage(CAL_SUBSTAGE_GUARANTEED_READ);
2386 	failed_substage = CAL_SUBSTAGE_GUARANTEED_READ;
2387 
2388 	for (d = 0; d <= dtaps_per_ptap && grp_calibrated == 0; d += 2) {
2389 		/*
2390 		 * In RLDRAMX we may be messing with the delay of pins in
2391 		 * the same write group but outside of the current read
2392 		 * group, but that's ok because we haven't calibrated the
2393 		 * output side yet.
2394 		 */
2395 		if (d > 0) {
2396 			scc_mgr_apply_group_all_out_delay_add_all_ranks
2397 			(write_group, write_test_bgn, d);
2398 		}
2399 
2400 		for (p = 0; p <= IO_DQDQS_OUT_PHASE_MAX && grp_calibrated == 0;
2401 			p++) {
2402 			/* set a particular dqdqs phase */
2403 			scc_mgr_set_dqdqs_output_phase_all_ranks(read_group, p);
2404 
2405 			debug_cond(DLEVEL == 1, "%s:%d calibrate_vfifo: g=%u \
2406 				   p=%u d=%u\n", __func__, __LINE__,
2407 				   read_group, p, d);
2408 
2409 			/*
2410 			 * Load up the patterns used by read calibration
2411 			 * using current DQDQS phase.
2412 			 */
2413 			rw_mgr_mem_calibrate_read_load_patterns(0, 1);
2414 			if (!(gbl->phy_debug_mode_flags &
2415 				PHY_DEBUG_DISABLE_GUARANTEED_READ)) {
2416 				if (!rw_mgr_mem_calibrate_read_test_patterns_all_ranks
2417 				    (read_group, 1, &bit_chk)) {
2418 					debug_cond(DLEVEL == 1, "%s:%d Guaranteed read test failed:",
2419 						   __func__, __LINE__);
2420 					debug_cond(DLEVEL == 1, " g=%u p=%u d=%u\n",
2421 						   read_group, p, d);
2422 					break;
2423 				}
2424 			}
2425 
2426 			/* case:56390 */
2427 			grp_calibrated = 1;
2428 		if (rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay
2429 		    (write_group, read_group, test_bgn)) {
2430 				/*
2431 				 * USER Read per-bit deskew can be done on a
2432 				 * per shadow register basis.
2433 				 */
2434 				for (rank_bgn = 0, sr = 0;
2435 					rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
2436 					rank_bgn += NUM_RANKS_PER_SHADOW_REG,
2437 					++sr) {
2438 					/*
2439 					 * Determine if this set of ranks
2440 					 * should be skipped entirely.
2441 					 */
2442 					if (!param->skip_shadow_regs[sr]) {
2443 						/*
2444 						 * If doing read-after-write
2445 						 * calibration, do not update
2446 						 * FOM now - do it then.
2447 						 */
2448 					if (!rw_mgr_mem_calibrate_vfifo_center
2449 						(rank_bgn, write_group,
2450 						read_group, test_bgn, 1, 0)) {
2451 							grp_calibrated = 0;
2452 							failed_substage =
2453 						CAL_SUBSTAGE_VFIFO_CENTER;
2454 						}
2455 					}
2456 				}
2457 			} else {
2458 				grp_calibrated = 0;
2459 				failed_substage = CAL_SUBSTAGE_DQS_EN_PHASE;
2460 			}
2461 		}
2462 	}
2463 
2464 	if (grp_calibrated == 0) {
2465 		set_failing_group_stage(write_group, CAL_STAGE_VFIFO,
2466 					failed_substage);
2467 		return 0;
2468 	}
2469 
2470 	/*
2471 	 * Reset the delay chains back to zero if they have moved > 1
2472 	 * (check for > 1 because the loop increments d even when the first
2473 	 * iteration passes).
2474 	 */
2475 	if (d > 2)
2476 		scc_mgr_zero_group(write_group, write_test_bgn, 1);
2477 
2478 	return 1;
2479 }
2480 
2481 /* VFIFO Calibration -- Read Deskew Calibration after write deskew */
2482 static uint32_t rw_mgr_mem_calibrate_vfifo_end(uint32_t read_group,
2483 					       uint32_t test_bgn)
2484 {
2485 	uint32_t rank_bgn, sr;
2486 	uint32_t grp_calibrated;
2487 	uint32_t write_group;
2488 
2489 	debug("%s:%d %u %u", __func__, __LINE__, read_group, test_bgn);
2490 
2491 	/* update info for sims */
2492 
2493 	reg_file_set_stage(CAL_STAGE_VFIFO_AFTER_WRITES);
2494 	reg_file_set_sub_stage(CAL_SUBSTAGE_VFIFO_CENTER);
2495 
2496 	write_group = read_group;
2497 
2498 	/* update info for sims */
2499 	reg_file_set_group(read_group);
2500 
2501 	grp_calibrated = 1;
2502 	/* Read per-bit deskew can be done on a per shadow register basis */
2503 	for (rank_bgn = 0, sr = 0; rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
2504 		rank_bgn += NUM_RANKS_PER_SHADOW_REG, ++sr) {
2505 		/* Determine if this set of ranks should be skipped entirely */
2506 		if (!param->skip_shadow_regs[sr]) {
2507 			/* This is the last calibration round, update FOM here */
2508 			if (!rw_mgr_mem_calibrate_vfifo_center(rank_bgn,
2509 								write_group,
2510 								read_group,
2511 								test_bgn, 0,
2512 								1)) {
2513 				grp_calibrated = 0;
2514 			}
2515 		}
2516 	}
2517 
2518 
2519 	if (grp_calibrated == 0) {
2520 		set_failing_group_stage(write_group,
2521 					CAL_STAGE_VFIFO_AFTER_WRITES,
2522 					CAL_SUBSTAGE_VFIFO_CENTER);
2523 		return 0;
2524 	}
2525 
2526 	return 1;
2527 }
2528 
2529 /* Calibrate LFIFO to find smallest read latency */
2530 static uint32_t rw_mgr_mem_calibrate_lfifo(void)
2531 {
2532 	uint32_t found_one;
2533 	uint32_t bit_chk;
2534 	uint32_t addr;
2535 
2536 	debug("%s:%d\n", __func__, __LINE__);
2537 
2538 	/* update info for sims */
2539 	reg_file_set_stage(CAL_STAGE_LFIFO);
2540 	reg_file_set_sub_stage(CAL_SUBSTAGE_READ_LATENCY);
2541 
2542 	/* Load up the patterns used by read calibration for all ranks */
2543 	rw_mgr_mem_calibrate_read_load_patterns(0, 1);
2544 	found_one = 0;
2545 
2546 	addr = (u32)&phy_mgr_cfg->phy_rlat;
2547 	do {
2548 		writel(gbl->curr_read_lat, addr);
2549 		debug_cond(DLEVEL == 2, "%s:%d lfifo: read_lat=%u",
2550 			   __func__, __LINE__, gbl->curr_read_lat);
2551 
2552 		if (!rw_mgr_mem_calibrate_read_test_all_ranks(0,
2553 							      NUM_READ_TESTS,
2554 							      PASS_ALL_BITS,
2555 							      &bit_chk, 1)) {
2556 			break;
2557 		}
2558 
2559 		found_one = 1;
2560 		/* reduce read latency and see if things are working */
2561 		/* correctly */
2562 		gbl->curr_read_lat--;
2563 	} while (gbl->curr_read_lat > 0);
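	/*
	 * Illustrative outcome (hypothetical latencies): if reads pass at
	 * latencies 9, 8 and 7 but fail at 6, the loop exits with
	 * curr_read_lat == 6 and found_one set, and the fudge factor below
	 * programs a final read latency of 6 + 2 = 8, one above the smallest
	 * passing value.
	 */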
2564 
2565 	/* reset the fifos to get pointers to known state */
2566 
2567 	addr = (u32)&phy_mgr_cmd->fifo_reset;
2568 	writel(0, addr);
2569 
2570 	if (found_one) {
2571 		/* add a fudge factor to the read latency that was determined */
2572 		gbl->curr_read_lat += 2;
2573 		addr = (u32)&phy_mgr_cfg->phy_rlat;
2574 		writel(gbl->curr_read_lat, addr);
2575 		debug_cond(DLEVEL == 2, "%s:%d lfifo: success: using \
2576 			   read_lat=%u\n", __func__, __LINE__,
2577 			   gbl->curr_read_lat);
2578 		return 1;
2579 	} else {
2580 		set_failing_group_stage(0xff, CAL_STAGE_LFIFO,
2581 					CAL_SUBSTAGE_READ_LATENCY);
2582 
2583 		debug_cond(DLEVEL == 2, "%s:%d lfifo: failed at initial \
2584 			   read_lat=%u\n", __func__, __LINE__,
2585 			   gbl->curr_read_lat);
2586 		return 0;
2587 	}
2588 }
2589 
2590 /*
2591  * issue write test command.
2592  * two variants are provided. one that just tests a write pattern and
2593  * another that tests datamask functionality.
2594  */
2595 static void rw_mgr_mem_calibrate_write_test_issue(uint32_t group,
2596 						  uint32_t test_dm)
2597 {
2598 	uint32_t mcc_instruction;
2599 	uint32_t quick_write_mode = (((STATIC_CALIB_STEPS) & CALIB_SKIP_WRITES) &&
2600 		ENABLE_SUPER_QUICK_CALIBRATION);
2601 	uint32_t rw_wl_nop_cycles;
2602 	uint32_t addr;
2603 
2604 	/*
2605 	 * Set counter and jump addresses for the right
2606 	 * number of NOP cycles.
2607 	 * The number of supported NOP cycles can range from -1 to infinity
2608 	 * Three different cases are handled:
2609 	 *
2610 	 * 1. For a number of NOP cycles greater than 0, the RW Mgr looping
2611 	 *    mechanism will be used to insert the right number of NOPs
2612 	 *
2613 	 * 2. For a number of NOP cycles equals to 0, the micro-instruction
2614 	 * 2. For a number of NOP cycles equal to 0, the micro-instruction
2615 	 *    micro-instruction that turns on DQS (for DDRx), or outputs write
2616 	 *    data (for RLD), skipping
2617 	 *    the NOP micro-instruction all together
2618 	 *    the NOP micro-instruction altogether
2619 	 * 3. A number of NOP cycles equal to -1 indicates that DQS must be
2620 	 *    turned on in the same micro-instruction that issues the write
2621 	 *    command. Then we need
2622 	 *    to directly jump to the micro-instruction that sends out the data
2623 	 *
2624 	 * NOTE: Implementing this mechanism uses 2 RW Mgr jump-counters
2625 	 *       (2 and 3). One jump-counter (0) is used to perform multiple
2626 	 *       write-read operations.
2627 	 *       One counter is left to issue this command in "multiple-group" mode.
2628 	 */
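	/*
	 * Illustrative case (hypothetical value): with rw_wl_nop_cycles == 2
	 * the final branch below is taken; CNTR 2 is programmed with 0 so
	 * the jump past the NOP is never taken, and CNTR 3 is programmed
	 * with 2 - 1 = 1 so the RW Mgr looping mechanism inserts the
	 * required NOP cycles before the data goes out.
	 */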
2629 
2630 	rw_wl_nop_cycles = gbl->rw_wl_nop_cycles;
2631 
2632 	if (rw_wl_nop_cycles == -1) {
2633 		/*
2634 		 * CNTR 2 - We want to execute the special write operation that
2635 		 * turns on DQS right away and then skip directly to the
2636 		 * instruction that sends out the data. We set the counter to a
2637 		 * large number so that the jump is always taken.
2638 		 */
2639 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr2;
2640 		writel(0xFF, addr);
2641 
2642 		/* CNTR 3 - Not used */
2643 		if (test_dm) {
2644 			mcc_instruction = RW_MGR_LFSR_WR_RD_DM_BANK_0_WL_1;
2645 			addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
2646 			writel(RW_MGR_LFSR_WR_RD_DM_BANK_0_DATA,
2647 			       addr);
2648 			addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add3;
2649 			writel(RW_MGR_LFSR_WR_RD_DM_BANK_0_NOP,
2650 			       addr);
2651 		} else {
2652 			mcc_instruction = RW_MGR_LFSR_WR_RD_BANK_0_WL_1;
2653 			addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
2654 			writel(RW_MGR_LFSR_WR_RD_BANK_0_DATA, addr);
2655 			addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add3;
2656 			writel(RW_MGR_LFSR_WR_RD_BANK_0_NOP, addr);
2657 		}
2658 	} else if (rw_wl_nop_cycles == 0) {
2659 		/*
2660 		 * CNTR 2 - We want to skip the NOP operation and go straight
2661 		 * to the DQS enable instruction. We set the counter to a large
2662 		 * number so that the jump is always taken.
2663 		 */
2664 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr2;
2665 		writel(0xFF, addr);
2666 
2667 		/* CNTR 3 - Not used */
2668 		if (test_dm) {
2669 			mcc_instruction = RW_MGR_LFSR_WR_RD_DM_BANK_0;
2670 			addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
2671 			writel(RW_MGR_LFSR_WR_RD_DM_BANK_0_DQS,
2672 			       addr);
2673 		} else {
2674 			mcc_instruction = RW_MGR_LFSR_WR_RD_BANK_0;
2675 			addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
2676 			writel(RW_MGR_LFSR_WR_RD_BANK_0_DQS, addr);
2677 		}
2678 	} else {
2679 		/*
2680 		 * CNTR 2 - In this case we want to execute the next instruction
2681 		 * and NOT take the jump. So we set the counter to 0. The jump
2682 		 * address doesn't matter.
2683 		 */
2684 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr2;
2685 		writel(0x0, addr);
2686 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add2;
2687 		writel(0x0, addr);
2688 
2689 		/*
2690 		 * CNTR 3 - Set the nop counter to the number of cycles we
2691 		 * need to loop for, minus 1.
2692 		 */
2693 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr3;
2694 		writel(rw_wl_nop_cycles - 1, addr);
2695 		if (test_dm) {
2696 			mcc_instruction = RW_MGR_LFSR_WR_RD_DM_BANK_0;
2697 			addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add3;
2698 			writel(RW_MGR_LFSR_WR_RD_DM_BANK_0_NOP, addr);
2699 		} else {
2700 			mcc_instruction = RW_MGR_LFSR_WR_RD_BANK_0;
2701 			addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add3;
2702 			writel(RW_MGR_LFSR_WR_RD_BANK_0_NOP, addr);
2703 		}
2704 	}
2705 
2706 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RESET_READ_DATAPATH_OFFSET;
2707 	writel(0, addr);
2708 
2709 	addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
2710 	if (quick_write_mode)
2711 		writel(0x08, addr);
2712 	else
2713 		writel(0x40, addr);
2714 
2715 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
2716 	writel(mcc_instruction, addr);
2717 
2718 	/*
2719 	 * CNTR 1 - This is used to ensure enough time elapses
2720 	 * for read data to come back.
2721 	 */
2722 	addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
2723 	writel(0x30, addr);
2724 
2725 	addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
2726 	if (test_dm) {
2727 		writel(RW_MGR_LFSR_WR_RD_DM_BANK_0_WAIT, addr);
2728 	} else {
2729 		writel(RW_MGR_LFSR_WR_RD_BANK_0_WAIT, addr);
2730 	}
2731 
2732 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
2733 	writel(mcc_instruction, addr + (group << 2));
2734 }
2735 
2736 /* Test writes, can check for a single bit pass or multiple bit pass */
2737 static uint32_t rw_mgr_mem_calibrate_write_test(uint32_t rank_bgn,
2738 	uint32_t write_group, uint32_t use_dm, uint32_t all_correct,
2739 	uint32_t *bit_chk, uint32_t all_ranks)
2740 {
2741 	uint32_t addr;
2742 	uint32_t r;
2743 	uint32_t correct_mask_vg;
2744 	uint32_t tmp_bit_chk;
2745 	uint32_t vg;
2746 	uint32_t rank_end = all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS :
2747 		(rank_bgn + NUM_RANKS_PER_SHADOW_REG);
2748 	uint32_t addr_rw_mgr;
2749 	uint32_t base_rw_mgr;
2750 
2751 	*bit_chk = param->write_correct_mask;
2752 	correct_mask_vg = param->write_correct_mask_vg;
2753 
2754 	for (r = rank_bgn; r < rank_end; r++) {
2755 		if (param->skip_ranks[r]) {
2756 			/* request to skip the rank */
2757 			continue;
2758 		}
2759 
2760 		/* set rank */
2761 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_READ_WRITE);
2762 
2763 		tmp_bit_chk = 0;
2764 		addr = (u32)&phy_mgr_cmd->fifo_reset;
2765 		addr_rw_mgr = SDR_PHYGRP_RWMGRGRP_ADDRESS;
2766 		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS-1; ; vg--) {
2767 			/* reset the fifos to get pointers to known state */
2768 			writel(0, addr);
2769 
2770 			tmp_bit_chk = tmp_bit_chk <<
2771 				(RW_MGR_MEM_DQ_PER_WRITE_DQS /
2772 				RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS);
2773 			rw_mgr_mem_calibrate_write_test_issue(write_group *
2774 				RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS+vg,
2775 				use_dm);
2776 
2777 			base_rw_mgr = readl(addr_rw_mgr);
2778 			tmp_bit_chk = tmp_bit_chk | (correct_mask_vg & ~(base_rw_mgr));
2779 			if (vg == 0)
2780 				break;
2781 		}
2782 		*bit_chk &= tmp_bit_chk;
2783 	}
2784 
2785 	if (all_correct) {
2786 		set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
2787 		debug_cond(DLEVEL == 2, "write_test(%u,%u,ALL) : %u == \
2788 			   %u => %lu", write_group, use_dm,
2789 			   *bit_chk, param->write_correct_mask,
2790 			   (long unsigned int)(*bit_chk ==
2791 			   param->write_correct_mask));
2792 		return *bit_chk == param->write_correct_mask;
2793 	} else {
2794 		set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
2795 		debug_cond(DLEVEL == 2, "write_test(%u,%u,ONE) : %u != ",
2796 		       write_group, use_dm, *bit_chk);
2797 		debug_cond(DLEVEL == 2, "%lu" " => %lu", (long unsigned int)0,
2798 			(long unsigned int)(*bit_chk != 0));
2799 		return *bit_chk != 0x00;
2800 	}
2801 }
2802 
2803 /*
2804  * center all windows. do per-bit-deskew to possibly increase size of
2805  * certain windows.
2806  */
2807 static uint32_t rw_mgr_mem_calibrate_writes_center(uint32_t rank_bgn,
2808 	uint32_t write_group, uint32_t test_bgn)
2809 {
2810 	uint32_t i, p, min_index;
2811 	int32_t d;
2812 	/*
2813 	 * Store these as signed since there are comparisons with
2814 	 * signed numbers.
2815 	 */
2816 	uint32_t bit_chk;
2817 	uint32_t sticky_bit_chk;
2818 	int32_t left_edge[RW_MGR_MEM_DQ_PER_WRITE_DQS];
2819 	int32_t right_edge[RW_MGR_MEM_DQ_PER_WRITE_DQS];
2820 	int32_t mid;
2821 	int32_t mid_min, orig_mid_min;
2822 	int32_t new_dqs, start_dqs, shift_dq;
2823 	int32_t dq_margin, dqs_margin, dm_margin;
2824 	uint32_t stop;
2825 	uint32_t temp_dq_out1_delay;
2826 	uint32_t addr;
2827 
2828 	debug("%s:%d %u %u", __func__, __LINE__, write_group, test_bgn);
2829 
2830 	dm_margin = 0;
2831 
2832 	addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_IO_OUT1_DELAY_OFFSET;
2833 	start_dqs = readl(addr +
2834 			  (RW_MGR_MEM_DQ_PER_WRITE_DQS << 2));
2835 
2836 	/* per-bit deskew */
2837 
2838 	/*
2839 	 * set the left and right edge of each bit to an illegal value
2840 	 * use (IO_IO_OUT1_DELAY_MAX + 1) as an illegal value.
2841 	 */
2842 	sticky_bit_chk = 0;
2843 	for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
2844 		left_edge[i]  = IO_IO_OUT1_DELAY_MAX + 1;
2845 		right_edge[i] = IO_IO_OUT1_DELAY_MAX + 1;
2846 	}
2847 
2848 	/* Search for the left edge of the window for each bit */
2849 	addr = (u32)&sdr_scc_mgr->update;
2850 	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX; d++) {
2851 		scc_mgr_apply_group_dq_out1_delay(write_group, test_bgn, d);
2852 
2853 		writel(0, addr);
2854 
2855 		/*
2856 		 * Stop searching when the write test doesn't pass AND when
2857 		 * we've seen a passing write on every bit.
2858 		 */
2859 		stop = !rw_mgr_mem_calibrate_write_test(rank_bgn, write_group,
2860 			0, PASS_ONE_BIT, &bit_chk, 0);
2861 		sticky_bit_chk = sticky_bit_chk | bit_chk;
2862 		stop = stop && (sticky_bit_chk == param->write_correct_mask);
2863 		debug_cond(DLEVEL == 2, "write_center(left): dtap=%d => %u \
2864 			   == %u && %u [bit_chk= %u ]\n",
2865 			d, sticky_bit_chk, param->write_correct_mask,
2866 			stop, bit_chk);
2867 
2868 		if (stop == 1) {
2869 			break;
2870 		} else {
2871 			for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
2872 				if (bit_chk & 1) {
2873 					/*
2874 					 * Remember a passing test as the
2875 					 * left_edge.
2876 					 */
2877 					left_edge[i] = d;
2878 				} else {
2879 					/*
2880 					 * If a left edge has not been seen
2881 					 * yet, then a future passing test will
2882 					 * mark this edge as the right edge.
2883 					 */
2884 					if (left_edge[i] ==
2885 						IO_IO_OUT1_DELAY_MAX + 1) {
2886 						right_edge[i] = -(d + 1);
2887 					}
2888 				}
2889 				debug_cond(DLEVEL == 2, "write_center[l,d=%d):", d);
2890 				debug_cond(DLEVEL == 2, "bit_chk_test=%d left_edge[%u]: %d",
2891 					   (int)(bit_chk & 1), i, left_edge[i]);
2892 				debug_cond(DLEVEL == 2, "right_edge[%u]: %d\n", i,
2893 				       right_edge[i]);
2894 				bit_chk = bit_chk >> 1;
2895 			}
2896 		}
2897 	}
2898 
2899 	/* Reset DQ delay chains to 0 */
2900 	scc_mgr_apply_group_dq_out1_delay(write_group, test_bgn, 0);
2901 	sticky_bit_chk = 0;
2902 	for (i = RW_MGR_MEM_DQ_PER_WRITE_DQS - 1;; i--) {
2903 		debug_cond(DLEVEL == 2, "%s:%d write_center: left_edge[%u]: \
2904 			   %d right_edge[%u]: %d\n", __func__, __LINE__,
2905 			   i, left_edge[i], i, right_edge[i]);
2906 
2907 		/*
2908 		 * Check for cases where we haven't found the left edge,
2909 		 * which makes our assignment of the right edge invalid.
2910 		 * Reset it to the illegal value.
2911 		 */
2912 		if ((left_edge[i] == IO_IO_OUT1_DELAY_MAX + 1) &&
2913 		    (right_edge[i] != IO_IO_OUT1_DELAY_MAX + 1)) {
2914 			right_edge[i] = IO_IO_OUT1_DELAY_MAX + 1;
2915 			debug_cond(DLEVEL == 2, "%s:%d write_center: reset \
2916 				   right_edge[%u]: %d\n", __func__, __LINE__,
2917 				   i, right_edge[i]);
2918 		}
2919 
2920 		/*
2921 		 * Reset sticky bit (except for bits where we have
2922 		 * seen the left edge).
2923 		 */
2924 		sticky_bit_chk = sticky_bit_chk << 1;
2925 		if ((left_edge[i] != IO_IO_OUT1_DELAY_MAX + 1))
2926 			sticky_bit_chk = sticky_bit_chk | 1;
2927 
2928 		if (i == 0)
2929 			break;
2930 	}
2931 
2932 	/* Search for the right edge of the window for each bit */
2933 	addr = (u32)&sdr_scc_mgr->update;
2934 	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX - start_dqs; d++) {
2935 		scc_mgr_apply_group_dqs_io_and_oct_out1(write_group,
2936 							d + start_dqs);
2937 
2938 		writel(0, addr);
2939 
2940 		/*
2941 		 * Stop searching when the write test doesn't pass AND when
2942 		 * we've seen a passing write on every bit.
2943 		 */
2944 		stop = !rw_mgr_mem_calibrate_write_test(rank_bgn, write_group,
2945 			0, PASS_ONE_BIT, &bit_chk, 0);
2946 
2947 		sticky_bit_chk = sticky_bit_chk | bit_chk;
2948 		stop = stop && (sticky_bit_chk == param->write_correct_mask);
2949 
2950 		debug_cond(DLEVEL == 2, "write_center (right): dtap=%u => %u == \
2951 			   %u && %u\n", d, sticky_bit_chk,
2952 			   param->write_correct_mask, stop);
2953 
2954 		if (stop == 1) {
2955 			if (d == 0) {
2956 				for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS;
2957 					i++) {
2958 					/* d = 0 failed, but it passed when
2959 					testing the left edge, so it must be
2960 					marginal, set it to -1 */
2961 					if (right_edge[i] ==
2962 						IO_IO_OUT1_DELAY_MAX + 1 &&
2963 						left_edge[i] !=
2964 						IO_IO_OUT1_DELAY_MAX + 1) {
2965 						right_edge[i] = -1;
2966 					}
2967 				}
2968 			}
2969 			break;
2970 		} else {
2971 			for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
2972 				if (bit_chk & 1) {
2973 					/*
2974 					 * Remember a passing test as
2975 					 * the right_edge.
2976 					 */
2977 					right_edge[i] = d;
2978 				} else {
2979 					if (d != 0) {
2980 						/*
2981 						 * If a right edge has not
2982 						 * been seen yet, then a future
2983 						 * passing test will mark this
2984 						 * edge as the left edge.
2985 						 */
2986 						if (right_edge[i] ==
2987 						    IO_IO_OUT1_DELAY_MAX + 1)
2988 							left_edge[i] = -(d + 1);
2989 					} else {
2990 						/*
2991 						 * d = 0 failed, but it passed
2992 						 * when testing the left edge,
2993 						 * so it must be marginal, set
2994 						 * it to -1.
2995 						 */
2996 						if (right_edge[i] ==
2997 						    IO_IO_OUT1_DELAY_MAX + 1 &&
2998 						    left_edge[i] !=
2999 						    IO_IO_OUT1_DELAY_MAX + 1)
3000 							right_edge[i] = -1;
3001 						/*
3002 						 * If a right edge has not been
3003 						 * seen yet, then a future
3004 						 * passing test will mark this
3005 						 * edge as the left edge.
3006 						 */
3007 						else if (right_edge[i] ==
3008 							IO_IO_OUT1_DELAY_MAX +
3009 							1)
3010 							left_edge[i] = -(d + 1);
3011 					}
3012 				}
3013 				debug_cond(DLEVEL == 2, "write_center[r,d=%d):", d);
3014 				debug_cond(DLEVEL == 2, "bit_chk_test=%d left_edge[%u]: %d",
3015 					   (int)(bit_chk & 1), i, left_edge[i]);
3016 				debug_cond(DLEVEL == 2, "right_edge[%u]: %d\n", i,
3017 					   right_edge[i]);
3018 				bit_chk = bit_chk >> 1;
3019 			}
3020 		}
3021 	}
3022 
3023 	/* Check that all bits have a window */
3024 	for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
3025 		debug_cond(DLEVEL == 2, "%s:%d write_center: left_edge[%u]: \
3026 			   %d right_edge[%u]: %d", __func__, __LINE__,
3027 			   i, left_edge[i], i, right_edge[i]);
3028 		if ((left_edge[i] == IO_IO_OUT1_DELAY_MAX + 1) ||
3029 		    (right_edge[i] == IO_IO_OUT1_DELAY_MAX + 1)) {
3030 			set_failing_group_stage(test_bgn + i,
3031 						CAL_STAGE_WRITES,
3032 						CAL_SUBSTAGE_WRITES_CENTER);
3033 			return 0;
3034 		}
3035 	}
3036 
3037 	/* Find middle of window for each DQ bit */
3038 	mid_min = left_edge[0] - right_edge[0];
3039 	min_index = 0;
3040 	for (i = 1; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
3041 		mid = left_edge[i] - right_edge[i];
3042 		if (mid < mid_min) {
3043 			mid_min = mid;
3044 			min_index = i;
3045 		}
3046 	}
3047 
3048 	/*
3049 	 * -mid_min/2 represents the amount that we need to move DQS.
3050 	 * If mid_min is odd and positive we'll need to add one to
3051 	 * make sure the rounding in further calculations is correct
3052 	 * (always bias to the right), so just add 1 for all positive values.
3053 	 */
3054 	if (mid_min > 0)
3055 		mid_min++;
3056 	mid_min = mid_min / 2;
3057 	debug_cond(DLEVEL == 1, "%s:%d write_center: mid_min=%d\n", __func__,
3058 		   __LINE__, mid_min);
3059 
3060 	/* Determine the amount we can change DQS (which is -mid_min) */
3061 	orig_mid_min = mid_min;
3062 	new_dqs = start_dqs;
3063 	mid_min = 0;
3064 	debug_cond(DLEVEL == 1, "%s:%d write_center: start_dqs=%d new_dqs=%d \
3065 		   mid_min=%d\n", __func__, __LINE__, start_dqs, new_dqs, mid_min);
3066 	/* Initialize data for export structures */
3067 	dqs_margin = IO_IO_OUT1_DELAY_MAX + 1;
3068 	dq_margin  = IO_IO_OUT1_DELAY_MAX + 1;
3069 
3070 	/* add delay to bring centre of all DQ windows to the same "level" */
3071 	addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_IO_OUT1_DELAY_OFFSET;
3072 	for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
3073 		/* Use values before divide by 2 to reduce round off error */
3074 		shift_dq = (left_edge[i] - right_edge[i] -
3075 			(left_edge[min_index] - right_edge[min_index]))/2  +
3076 		(orig_mid_min - mid_min);
3077 
3078 		debug_cond(DLEVEL == 2, "%s:%d write_center: before: shift_dq \
3079 			   [%u]=%d\n", __func__, __LINE__, i, shift_dq);
3080 
3081 		temp_dq_out1_delay = readl(addr + (i << 2));
3082 		if (shift_dq + (int32_t)temp_dq_out1_delay >
3083 			(int32_t)IO_IO_OUT1_DELAY_MAX) {
3084 			shift_dq = (int32_t)IO_IO_OUT1_DELAY_MAX - temp_dq_out1_delay;
3085 		} else if (shift_dq + (int32_t)temp_dq_out1_delay < 0) {
3086 			shift_dq = -(int32_t)temp_dq_out1_delay;
3087 		}
3088 		debug_cond(DLEVEL == 2, "write_center: after: shift_dq[%u]=%d\n",
3089 			   i, shift_dq);
3090 		scc_mgr_set_dq_out1_delay(write_group, i, temp_dq_out1_delay +
3091 					  shift_dq);
3092 		scc_mgr_load_dq(i);
3093 
3094 		debug_cond(DLEVEL == 2, "write_center: margin[%u]=[%d,%d]\n", i,
3095 			   left_edge[i] - shift_dq + (-mid_min),
3096 			   right_edge[i] + shift_dq - (-mid_min));
3097 		/* To determine values for export structures */
3098 		if (left_edge[i] - shift_dq + (-mid_min) < dq_margin)
3099 			dq_margin = left_edge[i] - shift_dq + (-mid_min);
3100 
3101 		if (right_edge[i] + shift_dq - (-mid_min) < dqs_margin)
3102 			dqs_margin = right_edge[i] + shift_dq - (-mid_min);
3103 	}
3104 
3105 	/* Move DQS */
3106 	scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, new_dqs);
3107 	addr = (u32)&sdr_scc_mgr->update;
3108 	writel(0, addr);
3109 
3110 	/* Centre DM */
3111 	debug_cond(DLEVEL == 2, "%s:%d write_center: DM\n", __func__, __LINE__);
3112 
3113 	/*
3114 	 * set the left and right edge of each bit to an illegal value,
3115 	 * use (IO_IO_OUT1_DELAY_MAX + 1) as an illegal value,
3116 	 */
3117 	left_edge[0]  = IO_IO_OUT1_DELAY_MAX + 1;
3118 	right_edge[0] = IO_IO_OUT1_DELAY_MAX + 1;
3119 	int32_t bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
3120 	int32_t end_curr = IO_IO_OUT1_DELAY_MAX + 1;
3121 	int32_t bgn_best = IO_IO_OUT1_DELAY_MAX + 1;
3122 	int32_t end_best = IO_IO_OUT1_DELAY_MAX + 1;
3123 	int32_t win_best = 0;
3124 
3125 	/* Search for the window (or part of it) using DM delay shifts */
3126 	addr = (u32)&sdr_scc_mgr->update;
3127 	for (d = IO_IO_OUT1_DELAY_MAX; d >= 0; d -= DELTA_D) {
3128 		scc_mgr_apply_group_dm_out1_delay(write_group, d);
3129 		writel(0, addr);
3130 
3131 		if (rw_mgr_mem_calibrate_write_test(rank_bgn, write_group, 1,
3132 						    PASS_ALL_BITS, &bit_chk,
3133 						    0)) {
3134 			/* USER Set current end of the window */
3135 			end_curr = -d;
3136 			/*
3137 			 * If a starting edge of our window has not been seen
3138 			 * this is our current start of the DM window.
3139 			 */
3140 			if (bgn_curr == IO_IO_OUT1_DELAY_MAX + 1)
3141 				bgn_curr = -d;
3142 
3143 			/*
3144 			 * If the current window is bigger than the best seen,
3145 			 * set best seen to the current window.
3146 			 */
3147 			if ((end_curr-bgn_curr+1) > win_best) {
3148 				win_best = end_curr-bgn_curr+1;
3149 				bgn_best = bgn_curr;
3150 				end_best = end_curr;
3151 			}
3152 		} else {
3153 			/* We just saw a failing test. Reset temp edge */
3154 			bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
3155 			end_curr = IO_IO_OUT1_DELAY_MAX + 1;
3156 		}
3157 	}
3158 
3159 
3160 	/* Reset DM delay chains to 0 */
3161 	scc_mgr_apply_group_dm_out1_delay(write_group, 0);
3162 
3163 	/*
3164 	 * Check to see if the current window nudges up against 0 delay.
3165 	 * If so, we need to continue the search by shifting DQS; otherwise
3166 	 * the DQS search begins as a new search. */
3167 	if (end_curr != 0) {
3168 		bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
3169 		end_curr = IO_IO_OUT1_DELAY_MAX + 1;
3170 	}
3171 
3172 	/* Search for the window (or part of it) using DQS delay shifts */
3173 	addr = (u32)&sdr_scc_mgr->update;
3174 	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX - new_dqs; d += DELTA_D) {
3175 		/*
3176 		 * Note: This only shifts DQS, so we may be limiting ourselves
3177 		 * to the width of the DQ window unnecessarily.
3178 		 */
3179 		scc_mgr_apply_group_dqs_io_and_oct_out1(write_group,
3180 							d + new_dqs);
3181 
3182 		writel(0, addr);
3183 		if (rw_mgr_mem_calibrate_write_test(rank_bgn, write_group, 1,
3184 						    PASS_ALL_BITS, &bit_chk,
3185 						    0)) {
3186 			/* USER Set current end of the window */
3187 			end_curr = d;
3188 			/*
3189 			 * If a beginning edge of our window has not been seen
3190 			 * this is our current beginning of the DM window.
3191 			 */
3192 			if (bgn_curr == IO_IO_OUT1_DELAY_MAX + 1)
3193 				bgn_curr = d;
3194 
3195 			/*
3196 			 * If the current window is bigger than the best seen,
3197 			 * set best seen to the current window.
3198 			 */
3199 			if ((end_curr-bgn_curr+1) > win_best) {
3200 				win_best = end_curr-bgn_curr+1;
3201 				bgn_best = bgn_curr;
3202 				end_best = end_curr;
3203 			}
3204 		} else {
3205 			/* We just saw a failing test. Reset temp edge */
3206 			bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
3207 			end_curr = IO_IO_OUT1_DELAY_MAX + 1;
3208 
3209 			/* Early exit optimization: if the remaining delay
3210 			chain space is less than the largest window already
3211 			seen, we can exit */
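			/*
			 * Illustrative numbers: if win_best is already 10 and
			 * only 8 more DQS taps remain
			 * (IO_IO_OUT1_DELAY_MAX - new_dqs - d == 8), no
			 * window found from here on can beat the best, so
			 * the loop breaks.
			 */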
3212 			if ((win_best-1) >
3213 			    (IO_IO_OUT1_DELAY_MAX - new_dqs - d)) {
3214 				break;
3215 			}
3216 		}
3217 	}
3218 
3219 	/* assign left and right edge for cal and reporting; */
3220 	left_edge[0] = -1*bgn_best;
3221 	right_edge[0] = end_best;
3222 
3223 	debug_cond(DLEVEL == 2, "%s:%d dm_calib: left=%d right=%d\n", __func__,
3224 		   __LINE__, left_edge[0], right_edge[0]);
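	/*
	 * Illustrative bookkeeping (made-up delays): if the best window ran
	 * from a DM delay of 3 taps down through 0 and on to a DQS shift of
	 * 2 taps (bgn_best = -3, end_best = 2), it is reported as
	 * left_edge[0] = 3 and right_edge[0] = 2, a 6-tap window straddling
	 * the unshifted position.
	 */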
3225 
3226 	/* Move DQS (back to orig) */
3227 	scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, new_dqs);
3228 
3229 	/* Move DM */
3230 
3231 	/* Find middle of window for the DM bit */
3232 	mid = (left_edge[0] - right_edge[0]) / 2;
3233 
3234 	/* only move right, since we are not moving DQS/DQ */
3235 	if (mid < 0)
3236 		mid = 0;
3237 
3238 	/* dm_margin should fail if we never find a window */
3239 	if (win_best == 0)
3240 		dm_margin = -1;
3241 	else
3242 		dm_margin = left_edge[0] - mid;
3243 
3244 	scc_mgr_apply_group_dm_out1_delay(write_group, mid);
3245 	addr = (u32)&sdr_scc_mgr->update;
3246 	writel(0, addr);
3247 
3248 	debug_cond(DLEVEL == 2, "%s:%d dm_calib: left=%d right=%d mid=%d \
3249 		   dm_margin=%d\n", __func__, __LINE__, left_edge[0],
3250 		   right_edge[0], mid, dm_margin);
3251 	/* Export values */
3252 	gbl->fom_out += dq_margin + dqs_margin;
3253 
3254 	debug_cond(DLEVEL == 2, "%s:%d write_center: dq_margin=%d \
3255 		   dqs_margin=%d dm_margin=%d\n", __func__, __LINE__,
3256 		   dq_margin, dqs_margin, dm_margin);
3257 
3258 	/*
3259 	 * Do not remove this line as it makes sure all of our
3260 	 * decisions have been applied.
3261 	 */
3262 	addr = (u32)&sdr_scc_mgr->update;
3263 	writel(0, addr);
3264 	return (dq_margin >= 0) && (dqs_margin >= 0) && (dm_margin >= 0);
3265 }
3266 
3267 /* calibrate the write operations */
3268 static uint32_t rw_mgr_mem_calibrate_writes(uint32_t rank_bgn, uint32_t g,
3269 	uint32_t test_bgn)
3270 {
3271 	/* update info for sims */
3272 	debug("%s:%d %u %u\n", __func__, __LINE__, g, test_bgn);
3273 
3274 	reg_file_set_stage(CAL_STAGE_WRITES);
3275 	reg_file_set_sub_stage(CAL_SUBSTAGE_WRITES_CENTER);
3276 
3277 	reg_file_set_group(g);
3278 
3279 	if (!rw_mgr_mem_calibrate_writes_center(rank_bgn, g, test_bgn)) {
3280 		set_failing_group_stage(g, CAL_STAGE_WRITES,
3281 					CAL_SUBSTAGE_WRITES_CENTER);
3282 		return 0;
3283 	}
3284 
3285 	return 1;
3286 }
3287 
3288 /* precharge all banks and activate row 0 in bank "000..." and bank "111..." */
3289 static void mem_precharge_and_activate(void)
3290 {
3291 	uint32_t r;
3292 	uint32_t addr;
3293 
3294 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
3295 		if (param->skip_ranks[r]) {
3296 			/* request to skip the rank */
3297 			continue;
3298 		}
3299 
3300 		/* set rank */
3301 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
3302 
3303 		/* precharge all banks ... */
3304 		addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
3305 		writel(RW_MGR_PRECHARGE_ALL, addr);
3306 
3307 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr0;
3308 		writel(0x0F, addr);
3309 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add0;
3310 		writel(RW_MGR_ACTIVATE_0_AND_1_WAIT1, addr);
3311 
3312 		addr = (u32)&sdr_rw_load_mgr_regs->load_cntr1;
3313 		writel(0x0F, addr);
3314 		addr = (u32)&sdr_rw_load_jump_mgr_regs->load_jump_add1;
3315 		writel(RW_MGR_ACTIVATE_0_AND_1_WAIT2, addr);
3316 
3317 		/* activate rows */
3318 		addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_RUN_SINGLE_GROUP_OFFSET;
3319 		writel(RW_MGR_ACTIVATE_0_AND_1, addr);
3320 	}
3321 }
3322 
3323 /* Configure various memory related parameters. */
3324 static void mem_config(void)
3325 {
3326 	uint32_t rlat, wlat;
3327 	uint32_t rw_wl_nop_cycles;
3328 	uint32_t max_latency;
3329 	uint32_t addr;
3330 
3331 	debug("%s:%d\n", __func__, __LINE__);
3332 	/* read in write and read latency */
3333 	addr = (u32)&data_mgr->t_wl_add;
3334 	wlat = readl(addr);
3335 
3336 	addr = (u32)&data_mgr->mem_t_add;
3337 	wlat += readl(addr);
3338 	/* WL for hard phy does not include additive latency */
3339 
3340 	/*
3341 	 * add additional write latency to offset the address/command extra
3342 	 * clock cycle. We change the AC mux setting causing AC to be delayed
3343 	 * by one mem clock cycle. Only do this for DDR3
3344 	 */
3345 	wlat = wlat + 1;
3346 
3347 	addr = (u32)&data_mgr->t_rl_add;
3348 	rlat = readl(addr);
3349 
3350 	rw_wl_nop_cycles = wlat - 2;
3351 	gbl->rw_wl_nop_cycles = rw_wl_nop_cycles;
3352 
3353 	/*
3354 	 * For AV/CV, lfifo is hardened and always runs at full rate so
3355 	 * max latency in AFI clocks, used here, is correspondingly smaller.
3356 	 */
3357 	max_latency = (1 << MAX_LATENCY_COUNT_WIDTH) / 1 - 1;
3358 	/* configure for a burst length of 8 */
3359 
3360 	/* write latency */
3361 	/* Adjust Write Latency for Hard PHY */
3362 	wlat = wlat + 1;
3363 
3364 	/* set a pretty high read latency initially */
3365 	gbl->curr_read_lat = rlat + 16;
3366 
3367 	if (gbl->curr_read_lat > max_latency)
3368 		gbl->curr_read_lat = max_latency;
3369 
3370 	addr = (u32)&phy_mgr_cfg->phy_rlat;
3371 	writel(gbl->curr_read_lat, addr);
3372 
3373 	/* advertise write latency */
3374 	gbl->curr_write_lat = wlat;
3375 	addr = (u32)&phy_mgr_cfg->afi_wlat;
3376 	writel(wlat - 2, addr);
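	/*
	 * Worked example (hypothetical register values): if t_wl_add reads
	 * back 6 and mem_t_add reads back 0, then wlat is 6 + 0 + 1 = 7
	 * after the AC-mux adjustment, rw_wl_nop_cycles is 7 - 2 = 5, the
	 * hard-PHY adjustment raises wlat to 8, and afi_wlat above is
	 * written with 8 - 2 = 6.
	 */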
3377 
3378 	/* initialize bit slips */
3379 	mem_precharge_and_activate();
3380 }
3381 
3382 /* Set VFIFO and LFIFO to instant-on settings in skip calibration mode */
3383 static void mem_skip_calibrate(void)
3384 {
3385 	uint32_t vfifo_offset;
3386 	uint32_t i, j, r;
3387 	uint32_t addr;
3388 
3389 	debug("%s:%d\n", __func__, __LINE__);
3390 	/* Need to update every shadow register set used by the interface */
3391 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
3392 		r += NUM_RANKS_PER_SHADOW_REG) {
3393 		/*
3394 		 * Set output phase alignment settings appropriate for
3395 		 * skip calibration.
3396 		 */
3397 		for (i = 0; i < RW_MGR_MEM_IF_READ_DQS_WIDTH; i++) {
3398 			scc_mgr_set_dqs_en_phase(i, 0);
3399 #if IO_DLL_CHAIN_LENGTH == 6
3400 			scc_mgr_set_dqdqs_output_phase(i, 6);
3401 #else
3402 			scc_mgr_set_dqdqs_output_phase(i, 7);
3403 #endif
3404 			/*
3405 			 * Case:33398
3406 			 *
3407 			 * Write data arrives to the I/O two cycles before write
3408 			 * latency is reached (720 deg).
3409 			 *   -> due to bit-slip in a/c bus
3410 			 *   -> to allow board skew where dqs is longer than ck
3411 			 *      -> how often can this happen!?
3412 			 *      -> can claim back some ptaps for high freq
3413 			 *       support if we can relax this, but i digress...
3414 			 *
3415 			 * The write_clk leads mem_ck by 90 deg
3416 			 * The minimum ptap of the OPA is 180 deg
3417 			 * Each ptap has (360 / IO_DLL_CHAIN_LENGTH) deg of delay
3418 			 * The write_clk is always delayed by 2 ptaps
3419 			 *
3420 			 * Hence, to make DQS aligned to CK, we need to delay
3421 			 * DQS by:
3422 			 *    (720 - 90 - 180 - 2 * (360 / IO_DLL_CHAIN_LENGTH))
3423 			 *
3424 			 * Dividing the above by (360 / IO_DLL_CHAIN_LENGTH)
3425 			 * gives us the number of ptaps, which simplifies to:
3426 			 *
3427 			 *    (1.25 * IO_DLL_CHAIN_LENGTH - 2)
3428 			 */
3429 			scc_mgr_set_dqdqs_output_phase(i, (1.25 *
3430 				IO_DLL_CHAIN_LENGTH - 2));
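			/*
			 * Worked example: with IO_DLL_CHAIN_LENGTH = 8 the
			 * expression is 1.25 * 8 - 2 = 8 ptaps; with a chain
			 * length of 6 it is 1.25 * 6 - 2 = 5.5, which the
			 * implicit conversion to the integer argument
			 * truncates to 5.  Note that this call supersedes
			 * the value written by the #if block above.
			 */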
3431 		}
3432 		addr = (u32)&sdr_scc_mgr->dqs_ena;
3433 		writel(0xff, addr);
3434 		addr = (u32)&sdr_scc_mgr->dqs_io_ena;
3435 		writel(0xff, addr);
3436 
3437 		addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_GROUP_COUNTER_OFFSET;
3438 		for (i = 0; i < RW_MGR_MEM_IF_WRITE_DQS_WIDTH; i++) {
3439 			writel(i, addr);
3440 		}
3441 		addr = (u32)&sdr_scc_mgr->dq_ena;
3442 		writel(0xff, addr);
3443 		addr = (u32)&sdr_scc_mgr->dm_ena;
3444 		writel(0xff, addr);
3445 		addr = (u32)&sdr_scc_mgr->update;
3446 		writel(0, addr);
3447 	}
3448 
3449 	/* Compensate for simulation model behaviour */
3450 	for (i = 0; i < RW_MGR_MEM_IF_READ_DQS_WIDTH; i++) {
3451 		scc_mgr_set_dqs_bus_in_delay(i, 10);
3452 		scc_mgr_load_dqs(i);
3453 	}
3454 	addr = (u32)&sdr_scc_mgr->update;
3455 	writel(0, addr);
3456 
3457 	/*
3458 	 * Arria V has hard FIFOs that can only be initialized by incrementing
3459 	 * them in the sequencer.
3460 	 */
3461 	vfifo_offset = CALIB_VFIFO_OFFSET;
3462 	addr = (u32)&phy_mgr_cmd->inc_vfifo_hard_phy;
3463 	for (j = 0; j < vfifo_offset; j++) {
3464 		writel(0xff, addr);
3465 	}
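	/*
	 * Hedged note: each write to inc_vfifo_hard_phy appears to advance
	 * the hard VFIFO pointer by a single position, so the loop above
	 * applies CALIB_VFIFO_OFFSET one-step increments rather than
	 * programming the offset in a single write.
	 */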
3466 	addr = (u32)&phy_mgr_cmd->fifo_reset;
3467 	writel(0, addr);
3468 
3469 	/*
3470 	 * For ACV with the hard lfifo, we get the skip-cal setting from a
3471 	 * generation-time constant.
3472 	 */
3473 	gbl->curr_read_lat = CALIB_LFIFO_OFFSET;
3474 	addr = (u32)&phy_mgr_cfg->phy_rlat;
3475 	writel(gbl->curr_read_lat, addr);
3476 }
3477 
3478 /* Memory calibration entry point */
3479 static uint32_t mem_calibrate(void)
3480 {
3481 	uint32_t i;
3482 	uint32_t rank_bgn, sr;
3483 	uint32_t write_group, write_test_bgn;
3484 	uint32_t read_group, read_test_bgn;
3485 	uint32_t run_groups, current_run;
3486 	uint32_t failing_groups = 0;
3487 	uint32_t group_failed = 0;
3488 	uint32_t sr_failed = 0;
3489 	uint32_t addr;
3490 
3491 	debug("%s:%d\n", __func__, __LINE__);
3492 	/* Initialize the data settings */
3493 
3494 	gbl->error_substage = CAL_SUBSTAGE_NIL;
3495 	gbl->error_stage = CAL_STAGE_NIL;
3496 	gbl->error_group = 0xff;
3497 	gbl->fom_in = 0;
3498 	gbl->fom_out = 0;
3499 
3500 	mem_config();
3501 
3502 	uint32_t bypass_mode = 0x1;
3503 	addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_GROUP_COUNTER_OFFSET;
3504 	for (i = 0; i < RW_MGR_MEM_IF_READ_DQS_WIDTH; i++) {
3505 		writel(i, addr);
3506 		scc_set_bypass_mode(i, bypass_mode);
3507 	}
3508 
3509 	if ((dyn_calib_steps & CALIB_SKIP_ALL) == CALIB_SKIP_ALL) {
3510 		/*
3511 		 * Set VFIFO and LFIFO to instant-on settings in skip
3512 		 * calibration mode.
3513 		 */
3514 		mem_skip_calibrate();
3515 	} else {
3516 		for (i = 0; i < NUM_CALIB_REPEAT; i++) {
3517 			/*
3518 			 * Zero all delay chain/phase settings for all
3519 			 * groups and all shadow register sets.
3520 			 */
3521 			scc_mgr_zero_all();
3522 
3523 			run_groups = ~param->skip_groups;
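			/*
			 * run_groups is a bitmask derived from
			 * param->skip_groups; each pass of the write-group
			 * loop below consumes RW_MGR_NUM_DQS_PER_WRITE_GROUP
			 * bits into current_run and skips the group when
			 * they are all zero.  With skip_groups == 0, for
			 * example, every bit is set and every group is
			 * calibrated.
			 */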
3524 
3525 			for (write_group = 0, write_test_bgn = 0; write_group
3526 				< RW_MGR_MEM_IF_WRITE_DQS_WIDTH; write_group++,
3527 				write_test_bgn += RW_MGR_MEM_DQ_PER_WRITE_DQS) {
3528 				/* Initialize the group failure flag */
3529 				group_failed = 0;
3530 
3531 				current_run = run_groups & ((1 <<
3532 					RW_MGR_NUM_DQS_PER_WRITE_GROUP) - 1);
3533 				run_groups = run_groups >>
3534 					RW_MGR_NUM_DQS_PER_WRITE_GROUP;
3535 
3536 				if (current_run == 0)
3537 					continue;
3538 
3539 				addr = SDR_PHYGRP_SCCGRP_ADDRESS | SCC_MGR_GROUP_COUNTER_OFFSET;
3540 				writel(write_group, addr);
3541 				scc_mgr_zero_group(write_group, write_test_bgn,
3542 						   0);
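				/*
				 * Each write group maps onto a contiguous
				 * slice of read groups via the
				 * READ_DQS_WIDTH / WRITE_DQS_WIDTH ratio.
				 * For example (hypothetical widths), with a
				 * read DQS width of 8 and a write DQS width
				 * of 4 the ratio is 2, so write group 1
				 * covers read groups 2 and 3 in the loop
				 * below.
				 */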
3543 
3544 				for (read_group = write_group *
3545 					RW_MGR_MEM_IF_READ_DQS_WIDTH /
3546 					RW_MGR_MEM_IF_WRITE_DQS_WIDTH,
3547 					read_test_bgn = 0;
3548 					read_group < (write_group + 1) *
3549 					RW_MGR_MEM_IF_READ_DQS_WIDTH /
3550 					RW_MGR_MEM_IF_WRITE_DQS_WIDTH &&
3551 					group_failed == 0;
3552 					read_group++, read_test_bgn +=
3553 					RW_MGR_MEM_DQ_PER_READ_DQS) {
3554 					/* Calibrate the VFIFO */
3555 					if (!((STATIC_CALIB_STEPS) &
3556 						CALIB_SKIP_VFIFO)) {
3557 						if (!rw_mgr_mem_calibrate_vfifo
3558 							(read_group,
3559 							read_test_bgn)) {
3560 							group_failed = 1;
3561 
3562 							if (!(gbl->
3563 							phy_debug_mode_flags &
3564 						PHY_DEBUG_SWEEP_ALL_GROUPS)) {
3565 								return 0;
3566 							}
3567 						}
3568 					}
3569 				}
3570 
3571 				/* Calibrate the output side */
3572 				if (group_failed == 0)	{
3573 					for (rank_bgn = 0, sr = 0; rank_bgn
3574 						< RW_MGR_MEM_NUMBER_OF_RANKS;
3575 						rank_bgn +=
3576 						NUM_RANKS_PER_SHADOW_REG,
3577 						++sr) {
3578 						sr_failed = 0;
3579 						if (!((STATIC_CALIB_STEPS) &
3580 						CALIB_SKIP_WRITES)) {
3581 							if ((STATIC_CALIB_STEPS)
3582 						& CALIB_SKIP_DELAY_SWEEPS) {
3583 						/* not needed in quick mode! */
3584 							} else {
3585 						/*
3586 						 * Determine if this set of
3587 						 * ranks should be skipped
3588 						 * entirely.
3589 						 */
3590 					if (!param->skip_shadow_regs[sr]) {
3591 						if (!rw_mgr_mem_calibrate_writes
3592 						(rank_bgn, write_group,
3593 						write_test_bgn)) {
3594 							sr_failed = 1;
3595 							if (!(gbl->
3596 							phy_debug_mode_flags &
3597 						PHY_DEBUG_SWEEP_ALL_GROUPS)) {
3598 								return 0;
3599 									}
3600 									}
3601 								}
3602 							}
3603 						}
3604 						if (sr_failed != 0)
3605 							group_failed = 1;
3606 					}
3607 				}
3608 
3609 				if (group_failed == 0) {
3610 					for (read_group = write_group *
3611 					RW_MGR_MEM_IF_READ_DQS_WIDTH /
3612 					RW_MGR_MEM_IF_WRITE_DQS_WIDTH,
3613 					read_test_bgn = 0;
3614 						read_group < (write_group + 1)
3615 						* RW_MGR_MEM_IF_READ_DQS_WIDTH
3616 						/ RW_MGR_MEM_IF_WRITE_DQS_WIDTH &&
3617 						group_failed == 0;
3618 						read_group++, read_test_bgn +=
3619 						RW_MGR_MEM_DQ_PER_READ_DQS) {
3620 						if (!((STATIC_CALIB_STEPS) &
3621 							CALIB_SKIP_WRITES)) {
3622 					if (!rw_mgr_mem_calibrate_vfifo_end
3623 						(read_group, read_test_bgn)) {
3624 							group_failed = 1;
3625 
3626 						if (!(gbl->phy_debug_mode_flags
3627 						& PHY_DEBUG_SWEEP_ALL_GROUPS)) {
3628 								return 0;
3629 								}
3630 							}
3631 						}
3632 					}
3633 				}
3634 
3635 				if (group_failed != 0)
3636 					failing_groups++;
3637 			}
3638 
3639 			/*
3640 			 * If there are any failing groups, then report
3641 			 * the failure.
3642 			 */
3643 			if (failing_groups != 0)
3644 				return 0;
3645 
3646 			/* Calibrate the LFIFO */
3647 			if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_LFIFO)) {
3648 				/*
3649 				 * If we're skipping groups as part of debug,
3650 				 * don't calibrate LFIFO.
3651 				 */
3652 				if (param->skip_groups == 0) {
3653 					if (!rw_mgr_mem_calibrate_lfifo())
3654 						return 0;
3655 				}
3656 			}
3657 		}
3658 	}
3659 
3660 	/*
3661 	 * Do not remove this line as it makes sure all of our decisions
3662 	 * have been applied.
3663 	 */
3664 	addr = (u32)&sdr_scc_mgr->update;
3665 	writel(0, addr);
3666 	return 1;
3667 }
3668 
3669 static uint32_t run_mem_calibrate(void)
3670 {
3671 	uint32_t pass;
3672 	uint32_t debug_info;
3673 	uint32_t addr;
3674 
3675 	debug("%s:%d\n", __func__, __LINE__);
3676 
3677 	/* Reset pass/fail status shown on afi_cal_success/fail */
3678 	addr = (u32)&phy_mgr_cfg->cal_status;
3679 	writel(PHY_MGR_CAL_RESET, addr);
3680 
3681 	addr = SDR_CTRLGRP_ADDRESS;
3682 	/* Stop the tracking manager */
3683 	uint32_t ctrlcfg = readl(addr);
3684 
3685 	addr = SDR_CTRLGRP_ADDRESS;
3686 	writel(ctrlcfg & 0xFFBFFFFF, addr);
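	/*
	 * The 0xFFBFFFFF mask clears only bit 22 of ctrlcfg; the unmodified
	 * value saved above is written back near the end of this function,
	 * restoring the tracking-manager setting once calibration is done.
	 */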
3687 
3688 	initialize();
3689 	rw_mgr_mem_initialize();
3690 
3691 	pass = mem_calibrate();
3692 
3693 	mem_precharge_and_activate();
3694 	addr = (u32)&phy_mgr_cmd->fifo_reset;
3695 	writel(0, addr);
3696 
3697 	/*
3698 	 * Handoff:
3699 	 * Don't return control of the PHY back to AFI when in debug mode.
3700 	 */
3701 	if ((gbl->phy_debug_mode_flags & PHY_DEBUG_IN_DEBUG_MODE) == 0) {
3702 		rw_mgr_mem_handoff();
3703 		/*
3704 		 * In Hard PHY this is a 2-bit control:
3705 		 * 0: AFI Mux Select
3706 		 * 1: DDIO Mux Select
3707 		 */
3708 		addr = (u32)&phy_mgr_cfg->mux_sel;
3709 		writel(0x2, addr);
3710 	}
3711 
3712 	addr = SDR_CTRLGRP_ADDRESS;
3713 	writel(ctrlcfg, addr);
3714 
3715 	if (pass) {
3716 		printf("%s: CALIBRATION PASSED\n", __FILE__);
3717 
3718 		gbl->fom_in /= 2;
3719 		gbl->fom_out /= 2;
3720 
3721 		if (gbl->fom_in > 0xff)
3722 			gbl->fom_in = 0xff;
3723 
3724 		if (gbl->fom_out > 0xff)
3725 			gbl->fom_out = 0xff;
3726 
3727 		/* Update the FOM in the register file */
3728 		debug_info = gbl->fom_in;
3729 		debug_info |= gbl->fom_out << 8;
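		/*
		 * Worked example (hypothetical figures of merit): fom_in =
		 * 0x20 and fom_out = 0x35 pack into debug_info as 0x3520,
		 * i.e. input margin in bits 7:0, output margin in bits 15:8.
		 */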
3730 		addr = (u32)&sdr_reg_file->fom;
3731 		writel(debug_info, addr);
3732 
3733 		addr = (u32)&phy_mgr_cfg->cal_debug_info;
3734 		writel(debug_info, addr);
3735 		addr = (u32)&phy_mgr_cfg->cal_status;
3736 		writel(PHY_MGR_CAL_SUCCESS, addr);
3737 	} else {
3738 		printf("%s: CALIBRATION FAILED\n", __FILE__);
3739 
3740 		debug_info = gbl->error_stage;
3741 		debug_info |= gbl->error_substage << 8;
3742 		debug_info |= gbl->error_group << 16;
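		/*
		 * Example encoding (hypothetical values): error_stage = 3,
		 * error_substage = 1, error_group = 2 yields debug_info =
		 * 0x00020103 (group in bits 23:16, substage in bits 15:8,
		 * stage in bits 7:0).
		 */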
3743 
3744 		addr = (u32)&sdr_reg_file->failing_stage;
3745 		writel(debug_info, addr);
3746 		addr = (u32)&phy_mgr_cfg->cal_debug_info;
3747 		writel(debug_info, addr);
3748 		addr = (u32)&phy_mgr_cfg->cal_status;
3749 		writel(PHY_MGR_CAL_FAIL, addr);
3750 
3751 		/* Update the failing group/stage in the register file */
3752 		debug_info = gbl->error_stage;
3753 		debug_info |= gbl->error_substage << 8;
3754 		debug_info |= gbl->error_group << 16;
3755 		addr = (u32)&sdr_reg_file->failing_stage;
3756 		writel(debug_info, addr);
3757 	}
3758 
3759 	return pass;
3760 }
3761 
3762 static void hc_initialize_rom_data(void)
3763 {
3764 	uint32_t i;
3765 	uint32_t addr;
3766 
3767 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_INST_ROM_WRITE_OFFSET;
3768 	for (i = 0; i < ARRAY_SIZE(inst_rom_init); i++) {
3769 		uint32_t data = inst_rom_init[i];
3770 		writel(data, addr + (i << 2));
3771 	}
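	/*
	 * Each ROM entry is a 32-bit word, so index i maps to byte offset
	 * i << 2; entry 5, for instance, lands at the base address plus 20.
	 */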
3772 
3773 	addr = SDR_PHYGRP_RWMGRGRP_ADDRESS | RW_MGR_AC_ROM_WRITE_OFFSET;
3774 	for (i = 0; i < ARRAY_SIZE(ac_rom_init); i++) {
3775 		uint32_t data = ac_rom_init[i];
3776 		writel(data, addr + (i << 2));
3777 	}
3778 }
3779 
3780 static void initialize_reg_file(void)
3781 {
3782 	uint32_t addr;
3783 
3784 	/* Initialize the register file with the correct data */
3785 	addr = (u32)&sdr_reg_file->signature;
3786 	writel(REG_FILE_INIT_SEQ_SIGNATURE, addr);
3787 
3788 	addr = (u32)&sdr_reg_file->debug_data_addr;
3789 	writel(0, addr);
3790 
3791 	addr = (u32)&sdr_reg_file->cur_stage;
3792 	writel(0, addr);
3793 
3794 	addr = (u32)&sdr_reg_file->fom;
3795 	writel(0, addr);
3796 
3797 	addr = (u32)&sdr_reg_file->failing_stage;
3798 	writel(0, addr);
3799 
3800 	addr = (u32)&sdr_reg_file->debug1;
3801 	writel(0, addr);
3802 
3803 	addr = (u32)&sdr_reg_file->debug2;
3804 	writel(0, addr);
3805 }
3806 
3807 static void initialize_hps_phy(void)
3808 {
3809 	uint32_t reg;
3810 	uint32_t addr;
3811 	/*
3812 	 * Tracking also gets configured here because it's in the
3813 	 * same register.
3814 	 */
3815 	uint32_t trk_sample_count = 7500;
3816 	uint32_t trk_long_idle_sample_count = (10 << 16) | 100;
3817 	/*
3818 	 * Format is number of outer loops in the 16 MSB, sample
3819 	 * count in 16 LSB.
3820 	 */
3821 
3822 	reg = 0;
3823 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_ACDELAYEN_SET(2);
3824 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_DQDELAYEN_SET(1);
3825 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_DQSDELAYEN_SET(1);
3826 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_DQSLOGICDELAYEN_SET(1);
3827 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_RESETDELAYEN_SET(0);
3828 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_LPDDRDIS_SET(1);
3829 	/*
3830 	 * This field selects the intrinsic latency to RDATA_EN/FULL path.
3831 	 * 00 - bypass, 01 - add 5 cycles, 10 - add 10 cycles, 11 - add 15 cycles.
3832 	 */
3833 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_ADDLATSEL_SET(0);
3834 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_SAMPLECOUNT_19_0_SET(
3835 		trk_sample_count);
3836 	addr = SDR_CTRLGRP_ADDRESS;
3837 	writel(reg, addr + SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_OFFSET);
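	/*
	 * Worked example: trk_sample_count = 7500 = 0x1d4c fits entirely in
	 * the 20-bit SAMPLECOUNT_19_0 field, so the SAMPLECOUNT_31_20 slice
	 * written to PHYCTRL_1 below is 0.  Likewise the long-idle value
	 * (10 << 16) | 100 = 0xa0064 fits in its 20-bit field, leaving 0
	 * for the LONGIDLESAMPLECOUNT_31_20 slice in PHYCTRL_2.
	 */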
3838 
3839 	reg = 0;
3840 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_SAMPLECOUNT_31_20_SET(
3841 		trk_sample_count >>
3842 		SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_SAMPLECOUNT_19_0_WIDTH);
3843 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_LONGIDLESAMPLECOUNT_19_0_SET(
3844 		trk_long_idle_sample_count);
3845 	writel(reg, addr + SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_OFFSET);
3846 
3847 	reg = 0;
3848 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_2_LONGIDLESAMPLECOUNT_31_20_SET(
3849 		trk_long_idle_sample_count >>
3850 		SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_LONGIDLESAMPLECOUNT_19_0_WIDTH);
3851 	writel(reg, addr + SDR_CTRLGRP_PHYCTRL_PHYCTRL_2_OFFSET);
3852 }
3853 
3854 static void initialize_tracking(void)
3855 {
3856 	uint32_t concatenated_longidle = 0x0;
3857 	uint32_t concatenated_delays = 0x0;
3858 	uint32_t concatenated_rw_addr = 0x0;
3859 	uint32_t concatenated_refresh = 0x0;
3860 	uint32_t trk_sample_count = 7500;
3861 	uint32_t dtaps_per_ptap;
3862 	uint32_t tmp_delay;
3863 	uint32_t addr;
3864 
3865 	/*
3866 	 * Compute a usable version of the value in case we skip the full
3867 	 * computation later.
3868 	 */
3869 	dtaps_per_ptap = 0;
3870 	tmp_delay = 0;
3871 	while (tmp_delay < IO_DELAY_PER_OPA_TAP) {
3872 		dtaps_per_ptap++;
3873 		tmp_delay += IO_DELAY_PER_DCHAIN_TAP;
3874 	}
3875 	dtaps_per_ptap--;
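	/*
	 * Worked example (hypothetical tap delays): with IO_DELAY_PER_OPA_TAP
	 * = 416 and IO_DELAY_PER_DCHAIN_TAP = 25, the loop above runs 17
	 * times (17 * 25 = 425 >= 416) and the decrement leaves
	 * dtaps_per_ptap = 16, the largest whole number of delay-chain taps
	 * that still fits inside one phase tap.
	 */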
3876 
3877 	concatenated_longidle = concatenated_longidle ^ 10;
3878 		/* longidle outer loop */
3879 	concatenated_longidle = concatenated_longidle << 16;
3880 	concatenated_longidle = concatenated_longidle ^ 100;
3881 		/* longidle sample count */
3882 	concatenated_delays = concatenated_delays ^ 243;
3883 		/* trfc, worst case of 933 MHz 4Gb */
3884 	concatenated_delays = concatenated_delays << 8;
3885 	concatenated_delays = concatenated_delays ^ 14;
3886 		/* trcd, worst case */
3887 	concatenated_delays = concatenated_delays << 8;
3888 	concatenated_delays = concatenated_delays ^ 10;
3889 		/* vfifo wait */
3890 	concatenated_delays = concatenated_delays << 8;
3891 	concatenated_delays = concatenated_delays ^ 4;
3892 		/* mux delay */
3893 
3894 	concatenated_rw_addr = concatenated_rw_addr ^ RW_MGR_IDLE;
3895 	concatenated_rw_addr = concatenated_rw_addr << 8;
3896 	concatenated_rw_addr = concatenated_rw_addr ^ RW_MGR_ACTIVATE_1;
3897 	concatenated_rw_addr = concatenated_rw_addr << 8;
3898 	concatenated_rw_addr = concatenated_rw_addr ^ RW_MGR_SGLE_READ;
3899 	concatenated_rw_addr = concatenated_rw_addr << 8;
3900 	concatenated_rw_addr = concatenated_rw_addr ^ RW_MGR_PRECHARGE_ALL;
3901 
3902 	concatenated_refresh = concatenated_refresh ^ RW_MGR_REFRESH_ALL;
3903 	concatenated_refresh = concatenated_refresh << 24;
3904 	concatenated_refresh = concatenated_refresh ^ 1000; /* trefi */
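	/*
	 * Since every field is shifted into previously-zeroed bits, the XORs
	 * above behave like ORs.  The resulting constants are:
	 *   concatenated_longidle = (10 << 16) | 100 = 0x000a0064
	 *   concatenated_delays   = (243 << 24) | (14 << 16) | (10 << 8) | 4
	 *                         = 0xf30e0a04
	 *   concatenated_refresh  = (RW_MGR_REFRESH_ALL << 24) | 1000
	 */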
3905 
3906 	/* Initialize the register file with the correct data */
3907 	addr = (u32)&sdr_reg_file->dtaps_per_ptap;
3908 	writel(dtaps_per_ptap, addr);
3909 
3910 	addr = (u32)&sdr_reg_file->trk_sample_count;
3911 	writel(trk_sample_count, addr);
3912 
3913 	addr = (u32)&sdr_reg_file->trk_longidle;
3914 	writel(concatenated_longidle, addr);
3915 
3916 	addr = (u32)&sdr_reg_file->delays;
3917 	writel(concatenated_delays, addr);
3918 
3919 	addr = (u32)&sdr_reg_file->trk_rw_mgr_addr;
3920 	writel(concatenated_rw_addr, addr);
3921 
3922 	addr = (u32)&sdr_reg_file->trk_read_dqs_width;
3923 	writel(RW_MGR_MEM_IF_READ_DQS_WIDTH, addr);
3924 
3925 	addr = (u32)&sdr_reg_file->trk_rfsh;
3926 	writel(concatenated_refresh, addr);
3927 }
3928 
3929 int sdram_calibration_full(void)
3930 {
3931 	struct param_type my_param;
3932 	struct gbl_type my_gbl;
3933 	uint32_t pass;
3934 	uint32_t i;
3935 
3936 	param = &my_param;
3937 	gbl = &my_gbl;
3938 
3939 	/* Initialize the debug mode flags */
3940 	gbl->phy_debug_mode_flags = 0;
3941 	/* Enable the calibration report by default */
3942 	gbl->phy_debug_mode_flags |= PHY_DEBUG_ENABLE_CAL_RPT;
3943 	/*
3944 	 * Only sweep all groups (regardless of fail state) by default.
3945 	 * The read test is enabled by default.
3946 	 */
3947 #if DISABLE_GUARANTEED_READ
3948 	gbl->phy_debug_mode_flags |= PHY_DEBUG_DISABLE_GUARANTEED_READ;
3949 #endif
3950 	/* Initialize the register file */
3951 	initialize_reg_file();
3952 
3953 	/* Initialize any PHY CSR */
3954 	initialize_hps_phy();
3955 
3956 	scc_mgr_initialize();
3957 
3958 	initialize_tracking();
3959 
3960 	/* Enable all ranks and groups */
3961 	for (i = 0; i < RW_MGR_MEM_NUMBER_OF_RANKS; i++)
3962 		param->skip_ranks[i] = 0;
3963 	for (i = 0; i < NUM_SHADOW_REGS; ++i)
3964 		param->skip_shadow_regs[i] = 0;
3965 	param->skip_groups = 0;
3966 
3967 	printf("%s: Preparing to start memory calibration\n", __FILE__);
3968 
3969 	debug("%s:%d\n", __func__, __LINE__);
3970 	debug_cond(DLEVEL == 1,
3971 		   "DDR3 FULL_RATE ranks=%u cs/dimm=%u dq/dqs=%u,%u vg/dqs=%u,%u ",
3972 		   RW_MGR_MEM_NUMBER_OF_RANKS, RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM,
3973 		   RW_MGR_MEM_DQ_PER_READ_DQS, RW_MGR_MEM_DQ_PER_WRITE_DQS,
3974 		   RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS,
3975 		   RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS);
3976 	debug_cond(DLEVEL == 1,
3977 		   "dqs=%u,%u dq=%u dm=%u ptap_delay=%u dtap_delay=%u ",
3978 		   RW_MGR_MEM_IF_READ_DQS_WIDTH, RW_MGR_MEM_IF_WRITE_DQS_WIDTH,
3979 		   RW_MGR_MEM_DATA_WIDTH, RW_MGR_MEM_DATA_MASK_WIDTH,
3980 		   IO_DELAY_PER_OPA_TAP, IO_DELAY_PER_DCHAIN_TAP);
3981 	debug_cond(DLEVEL == 1, "dtap_dqsen_delay=%u, dll=%u",
3982 		   IO_DELAY_PER_DQS_EN_DCHAIN_TAP, IO_DLL_CHAIN_LENGTH);
3983 	debug_cond(DLEVEL == 1, "max values: en_p=%u dqdqs_p=%u en_d=%u dqs_in_d=%u ",
3984 		   IO_DQS_EN_PHASE_MAX, IO_DQDQS_OUT_PHASE_MAX,
3985 		   IO_DQS_EN_DELAY_MAX, IO_DQS_IN_DELAY_MAX);
3986 	debug_cond(DLEVEL == 1, "io_in_d=%u io_out1_d=%u io_out2_d=%u ",
3987 		   IO_IO_IN_DELAY_MAX, IO_IO_OUT1_DELAY_MAX,
3988 		   IO_IO_OUT2_DELAY_MAX);
3989 	debug_cond(DLEVEL == 1, "dqs_in_reserve=%u dqs_out_reserve=%u\n",
3990 		   IO_DQS_IN_RESERVE, IO_DQS_OUT_RESERVE);
3991 
3992 	hc_initialize_rom_data();
3993 
3994 	/* update info for sims */
3995 	reg_file_set_stage(CAL_STAGE_NIL);
3996 	reg_file_set_group(0);
3997 
3998 	/*
3999 	 * Load the global needed for those actions that require
4000 	 * some dynamic calibration support.
4001 	 */
4002 	dyn_calib_steps = STATIC_CALIB_STEPS;
4003 	/*
4004 	 * Load global to allow dynamic selection of delay loop settings
4005 	 * based on calibration mode.
4006 	 */
4007 	if (!(dyn_calib_steps & CALIB_SKIP_DELAY_LOOPS))
4008 		skip_delay_mask = 0xff;
4009 	else
4010 		skip_delay_mask = 0x0;
4011 
4012 	pass = run_mem_calibrate();
4013 
4014 	printf("%s: Calibration complete\n", __FILE__);
4015 	return pass;
4016 }
4017